From 501a914e147086365d955e3844e01fbb48a43fde Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 01/40] feat(graph): add Nozzle Flight service client --- Cargo.lock | 477 ++++++++++++++++++++++- Cargo.toml | 6 + graph/Cargo.toml | 6 + graph/src/lib.rs | 2 + graph/src/nozzle/client/flight_client.rs | 202 ++++++++++ graph/src/nozzle/client/mod.rs | 28 ++ graph/src/nozzle/error.rs | 5 + graph/src/nozzle/log.rs | 20 + graph/src/nozzle/mod.rs | 7 + 9 files changed, 745 insertions(+), 8 deletions(-) create mode 100644 graph/src/nozzle/client/flight_client.rs create mode 100644 graph/src/nozzle/client/mod.rs create mode 100644 graph/src/nozzle/error.rs create mode 100644 graph/src/nozzle/log.rs create mode 100644 graph/src/nozzle/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 65392512ce9..a6a2fd81086 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,6 +36,20 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if 1.0.0", + "const-random", + "getrandom 0.3.1", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -145,6 +159,240 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +[[package]] +name = "arrow" +version = "55.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.2", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] 
+name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-flight" +version = "55.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e0fad280f41a918d53ba48288a246ff04202d463b3b380fbc0edecdcb52cfd" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", + "base64 0.22.1", + "bytes", + "futures 0.3.31", + "once_cell", + "paste", + "prost", + "prost-types", + "tonic", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.11.4", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "ascii_utils" version = "0.9.3" @@ 
-297,6 +545,15 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -765,6 +1022,26 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.15", + "once_cell", + "tiny-keccak 2.0.2", +] + [[package]] name = "constant_time_eq" version = "0.1.5" @@ -1624,6 +1901,16 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "25.9.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" +dependencies = [ + "bitflags 2.9.0", + "rustc_version", +] + [[package]] name = "flate2" version = "1.0.30" @@ -1920,7 +2207,10 @@ name = "graph" version = "0.36.0" dependencies = [ "Inflector", + "ahash", "anyhow", + "arrow", + "arrow-flight", "async-stream", "async-trait", "atomic_refcell", @@ -1952,6 +2242,7 @@ dependencies = [ "hyper 1.7.0", "hyper-util", "itertools", + "lazy-regex", "lazy_static", "lru_time_cache", "maplit", @@ -2400,6 +2691,18 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if 1.0.0", + "crunchy", + "num-traits", + "zerocopy", +] + [[package]] name = "handlebars" version = "5.1.2" @@ -3250,6 +3553,29 @@ dependencies = [ "libc", ] +[[package]] +name = "lazy-regex" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "191898e17ddee19e60bccb3945aa02339e81edd4a8c50e21fd4d48cdecda7b29" +dependencies = [ + "lazy-regex-proc_macros", + "once_cell", + "regex", +] + +[[package]] +name = "lazy-regex-proc_macros" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35dc8b0da83d1a9507e12122c80dea71a9c7c613014347392483a83ea593e04" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn 2.0.106", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -3268,6 +3594,63 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + 
+[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + [[package]] name = "libc" version = "0.2.175" @@ -3543,6 +3926,20 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint 0.4.6", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.2.6" @@ -3565,6 +3962,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -3580,6 +3986,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint 0.4.6", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -3587,6 +4015,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -3649,9 +4078,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = 
"42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "opaque-debug" @@ -3783,6 +4212,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -4487,9 +4922,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -4499,9 +4934,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -4510,9 +4945,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "reqwest" @@ -5040,6 +5475,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "0.3.11" @@ -7344,6 +7785,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43fa6694ed34d6e57407afbccdeecfa268c470a7d2a5b0cf49ce9fcc345afb90" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c640b22cd9817fae95be82f0d2f90b11f7605f6c319d16705c459b27ac2cbc26" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "zerofrom" version = "0.1.6" diff --git a/Cargo.toml b/Cargo.toml index c7c25b817a5..0845798240e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -98,6 +98,12 @@ substreams-entity-change = "2" substreams-near-core = "=0.10.2" rand = { version = "0.9.2", features = ["os_rng"] } +# Dependencies related to Nozzle Subgraphs +ahash = "0.8.11" +arrow = { version = "=55.0.0" } +arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } +lazy-regex = "3.4.1" + # Incremental compilation on Rust 1.58 causes an ICE on build. As soon as graph node builds again, these can be removed. 
[profile.test] incremental = false
diff --git a/Cargo.toml b/Cargo.toml
@@ -98,6 +98,12 @@
diff --git a/graph/Cargo.toml b/graph/Cargo.toml
index 44e004be00c..23df37d0fd1 100644
--- a/graph/Cargo.toml
+++ b/graph/Cargo.toml
@@ -103,6 +103,12 @@
 serde_plain = "1.0.2"
 csv = "1.3.1"
 object_store = { version = "0.12.3", features = ["gcp"] }
+# Dependencies related to Nozzle Subgraphs
+ahash.workspace = true
+arrow-flight.workspace = true
+arrow.workspace = true
+lazy-regex.workspace = true
+
 [dev-dependencies]
 clap.workspace = true
 maplit = "1.0.2"
diff --git a/graph/src/lib.rs b/graph/src/lib.rs
index 05407603f48..03ee57b8e13 100644
--- a/graph/src/lib.rs
+++ b/graph/src/lib.rs
@@ -37,6 +37,8 @@
 pub mod env;
 pub mod ipfs;
+pub mod nozzle;
+
 /// Wrapper for spawning tasks that abort on panic, which is our default.
 mod task_spawn;
 pub use task_spawn::{
diff --git a/graph/src/nozzle/client/flight_client.rs b/graph/src/nozzle/client/flight_client.rs
new file mode 100644
index 00000000000..ea89aba6005
--- /dev/null
+++ b/graph/src/nozzle/client/flight_client.rs
@@ -0,0 +1,202 @@
+use std::{
+    hash::{Hash, Hasher},
+    time::Duration,
+};
+
+use ahash::AHasher;
+use arrow::{array::RecordBatch, datatypes::Schema, error::ArrowError};
+use arrow_flight::{
+    error::FlightError, flight_service_client::FlightServiceClient,
+    sql::client::FlightSqlServiceClient,
+};
+use async_stream::try_stream;
+use bytes::Bytes;
+use futures03::{future::BoxFuture, stream::BoxStream, StreamExt};
+use lazy_regex::regex_is_match;
+use slog::{debug, Logger};
+use thiserror::Error;
+use tonic::transport::{Channel, ClientTlsConfig, Endpoint};
+
+use crate::{
+    nozzle::{
+        client::Client,
+        error,
+        log::{one_line, Logger as _},
+    },
+    prelude::CheapClone,
+};
+
+/// A client for the Nozzle Flight gRPC service.
+///
+/// This client connects to a Nozzle server and executes SQL queries
+/// using the Apache Arrow Flight protocol.
+pub struct FlightClient {
+    channel: Channel,
+}
+
+#[derive(Debug, Error)]
+pub enum Error {
+    // Address excluded to avoid leaking sensitive details in logs
+    #[error("invalid address")]
+    InvalidAddress,
+
+    #[error("service failed: {0:#}")]
+    Service(#[source] ArrowError),
+
+    #[error("stream failed: {0:#}")]
+    Stream(#[source] FlightError),
+}
+
+impl FlightClient {
+    /// Constructs a new Nozzle client connected to the specified Nozzle Flight service address.
+    pub fn new(addr: impl Into<Bytes>) -> Result<Self, Error> {
+        let addr: Bytes = addr.into();
+        let is_https = std::str::from_utf8(&addr).map_or(false, |a| a.starts_with("https://"));
+
+        let mut endpoint = Endpoint::from_shared(addr)
+            .map_err(|_e| Error::InvalidAddress)?
+            .tcp_keepalive(Some(Duration::from_secs(30)))
+            .keep_alive_while_idle(true)
+            .http2_adaptive_window(true)
+            .initial_connection_window_size(Some(32 * 1024 * 1024))
+            .initial_stream_window_size(Some(16 * 1024 * 1024))
+            .connect_timeout(Duration::from_secs(10));
+
+        if is_https {
+            let mut tls_config = ClientTlsConfig::new();
+            tls_config = tls_config.with_native_roots();
+
+            endpoint = endpoint.tls_config(tls_config).unwrap();
+        }
+
+        Ok(Self {
+            channel: endpoint.connect_lazy(),
+        })
+    }
+
+    fn raw_client(&self) -> FlightSqlServiceClient<Channel> {
+        let channel = self.channel.cheap_clone();
+        let client = FlightServiceClient::new(channel)
+            .max_encoding_message_size(256 * 1024 * 1024)
+            .max_decoding_message_size(256 * 1024 * 1024);
+
+        FlightSqlServiceClient::new_from_inner(client)
+    }
+}
+
+impl Client for FlightClient {
+    type Error = Error;
+
+    fn schema(
+        &self,
+        logger: &Logger,
+        query: impl ToString,
+    ) -> BoxFuture<'static, Result<Schema, Self::Error>> {
+        let logger = logger.component("nozzle::FlightClient");
+        let mut raw_client = self.raw_client();
+        let query = query.to_string();
+
+        Box::pin(async move {
+            const TXN_ID: Option<Bytes> = None;
+
+            debug!(logger, "Executing SQL query";
+                "query" => &*one_line(&query)
+            );
+
+            let flight_info = raw_client
+                .execute(query, TXN_ID)
+                .await
+                .map_err(Error::Service)?;
+
+            flight_info.try_decode_schema().map_err(Error::Service)
+        })
+    }
+
+    fn query(
+        &self,
+        logger: &Logger,
+        query: impl ToString,
+    ) -> BoxStream<'static, Result<RecordBatch, Self::Error>> {
+        let logger = logger.component("nozzle::FlightClient");
+        let mut raw_client = self.raw_client();
+        let query = query.to_string();
+        let query_id = query_id(&query);
+
+        try_stream! {
+            const TXN_ID: Option<Bytes> = None;
+
+            debug!(logger, "Executing SQL query";
+                "query" => &*one_line(&query),
+                "query_id" => query_id
+            );
+
+            let flight_info = raw_client
+                .execute(query, TXN_ID)
+                .await
+                .map_err(Error::Service)?;
+
+            for endpoint in flight_info.endpoint {
+                let Some(ticket) = endpoint.ticket else {
+                    continue;
+                };
+
+                let mut stream = raw_client.do_get(ticket).await.map_err(Error::Service)?;
+                let mut batch_index = 0u32;
+
+                while let Some(batch_result) = stream.next().await {
+                    debug!(logger, "Received a new record batch";
+                        "query_id" => query_id,
+                        "batch_index" => batch_index,
+                        "num_rows" => batch_result.as_ref().map_or(0, |b| b.num_rows()),
+                        "memory_size_bytes" => batch_result.as_ref().map_or(0, |b| b.get_array_memory_size())
+                    );
+
+                    let record_batch = batch_result.map_err(Error::Stream)?;
+                    yield record_batch;
+
+                    batch_index += 1;
+                }
+
+                debug!(logger, "Query execution completed successfully";
+                    "query_id" => query_id,
+                    "batch_count" => batch_index
+                );
+            }
+        }
+        .boxed()
+    }
+}
+
+impl error::IsDeterministic for Error {
+    fn is_deterministic(&self) -> bool {
+        static PATTERNS: &[&str] = &[
+            r#", message: "SQL parse error:"#,
+            r#", message: "error looking up datasets:"#,
+            r#", message: "planning error:"#,
+        ];
+
+        let msg = self.to_string();
+
+        for &pattern in PATTERNS {
+            if msg.contains(pattern) {
+                return true;
+            }
+        }
+
+        if regex_is_match!(r#", message: "dataset '.*?' not found, full error:"#, &msg) {
+            return true;
+        }
+
+        false
+    }
+}
+
+/// Generates an ID from a SQL query for log correlation.
+///
+/// The ID allows connecting related logs without including the full SQL
+/// query in every log message.
+fn query_id(query: &str) -> u32 {
+    let mut hasher = AHasher::default();
+    query.hash(&mut hasher);
+    hasher.finish() as u32
+}
diff --git a/graph/src/nozzle/client/mod.rs b/graph/src/nozzle/client/mod.rs
new file mode 100644
index 00000000000..6567af255dd
--- /dev/null
+++ b/graph/src/nozzle/client/mod.rs
@@ -0,0 +1,28 @@
+pub mod flight_client;
+
+use std::error::Error;
+
+use arrow::{array::RecordBatch, datatypes::Schema};
+use futures03::{future::BoxFuture, stream::BoxStream};
+use slog::Logger;
+
+use crate::nozzle::error;
+
+/// Client for connecting to Nozzle core and executing SQL queries.
+pub trait Client {
+    type Error: Error + error::IsDeterministic;
+
+    /// Executes a SQL query and returns the corresponding schema.
+    fn schema(
+        &self,
+        logger: &Logger,
+        query: impl ToString,
+    ) -> BoxFuture<'static, Result<Schema, Self::Error>>;
+
+    /// Executes a SQL query and streams the requested data in batches.
+    fn query(
+        &self,
+        logger: &Logger,
+        query: impl ToString,
+    ) -> BoxStream<'static, Result<RecordBatch, Self::Error>>;
+}
diff --git a/graph/src/nozzle/error.rs b/graph/src/nozzle/error.rs
new file mode 100644
index 00000000000..3489d7a94de
--- /dev/null
+++ b/graph/src/nozzle/error.rs
@@ -0,0 +1,5 @@
+/// Checks whether errors are deterministic.
+pub trait IsDeterministic {
+    /// Returns `true` if the error is deterministic.
+    fn is_deterministic(&self) -> bool;
+}
diff --git a/graph/src/nozzle/log.rs b/graph/src/nozzle/log.rs
new file mode 100644
index 00000000000..f494a46a8de
--- /dev/null
+++ b/graph/src/nozzle/log.rs
@@ -0,0 +1,20 @@
+use std::borrow::Cow;
+
+use lazy_regex::regex_replace_all;
+
+/// Extends the [slog::Logger] with methods commonly used in Nozzle modules
+pub trait Logger {
+    /// Creates a new child logger scoped to a specific component
+    fn component(&self, name: &'static str) -> slog::Logger;
+}
+
+impl Logger for slog::Logger {
+    fn component(&self, name: &'static str) -> slog::Logger {
+        self.new(slog::o!("component" => name))
+    }
+}
+
+/// Removes newlines and extra spaces from a string
+pub fn one_line<'a>(s: &'a str) -> Cow<'a, str> {
+    regex_replace_all!(r"(\\r)?(\\n)?\s+", s, " ")
+}
diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs
new file mode 100644
index 00000000000..bd90587fbb1
--- /dev/null
+++ b/graph/src/nozzle/mod.rs
@@ -0,0 +1,7 @@
+//! This module contains the functionality required to support Nozzle Subgraphs.
+ +pub mod client; +pub mod error; +pub mod log; + +pub use self::client::{flight_client::FlightClient, Client}; From 941f4ec06272bb4df4a6aad4bc18161c2d74b96b Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 02/40] feat(graph): add Nozzle stream aggregator --- Cargo.lock | 799 +++++++++++++++++- Cargo.toml | 1 + graph/Cargo.toml | 1 + graph/src/nozzle/common/mod.rs | 4 + graph/src/nozzle/mod.rs | 2 + graph/src/nozzle/stream_aggregator/error.rs | 49 ++ graph/src/nozzle/stream_aggregator/mod.rs | 198 +++++ .../record_batch/aggregator.rs | 230 +++++ .../stream_aggregator/record_batch/buffer.rs | 209 +++++ .../stream_aggregator/record_batch/decoder.rs | 94 +++ .../record_batch/group_data.rs | 88 ++ .../stream_aggregator/record_batch/mod.rs | 38 + 12 files changed, 1699 insertions(+), 14 deletions(-) create mode 100644 graph/src/nozzle/common/mod.rs create mode 100644 graph/src/nozzle/stream_aggregator/error.rs create mode 100644 graph/src/nozzle/stream_aggregator/mod.rs create mode 100644 graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs create mode 100644 graph/src/nozzle/stream_aggregator/record_batch/buffer.rs create mode 100644 graph/src/nozzle/stream_aggregator/record_batch/decoder.rs create mode 100644 graph/src/nozzle/stream_aggregator/record_batch/group_data.rs create mode 100644 graph/src/nozzle/stream_aggregator/record_batch/mod.rs diff --git a/Cargo.lock b/Cargo.lock index a6a2fd81086..85ca7d28b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -65,6 +65,279 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "alloy" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e15860af634cad451f598712c24ca7fd9b45d84fff68ab8d4967567fa996c64" +dependencies = [ + "alloy-consensus", + "alloy-core", + "alloy-eips", + "alloy-serde", + "alloy-trie", +] + +[[package]] +name = "alloy-consensus" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6440213a22df93a87ed512d2f668e7dc1d62a05642d107f82d61edc9e12370" +dependencies = [ + "alloy-eips", + "alloy-primitives", + "alloy-rlp", + "alloy-serde", + "alloy-trie", + "alloy-tx-macros", + "auto_impl", + "c-kzg", + "derive_more 2.0.1", + "either", + "k256", + "once_cell", + "rand 0.8.5", + "secp256k1 0.30.0", + "serde", + "serde_json", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-core" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca96214615ec8cf3fa2a54b32f486eb49100ca7fe7eb0b8c1137cd316e7250a" +dependencies = [ + "alloy-json-abi", + "alloy-primitives", + "alloy-sol-types", +] + +[[package]] +name = "alloy-eip2124" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "741bdd7499908b3aa0b159bba11e71c8cddd009a2c2eb7a06e825f1ec87900a5" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "crc", + "serde", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-eip2930" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9441120fa82df73e8959ae0e4ab8ade03de2aaae61be313fbf5746277847ce25" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "serde", +] + +[[package]] +name = 
"alloy-eip7702" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2919c5a56a1007492da313e7a3b6d45ef5edc5d33416fdec63c0d7a2702a0d20" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "serde", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-eips" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd2c7ae05abcab4483ce821f12f285e01c0b33804e6883dd9ca1569a87ee2be" +dependencies = [ + "alloy-eip2124", + "alloy-eip2930", + "alloy-eip7702", + "alloy-primitives", + "alloy-rlp", + "alloy-serde", + "auto_impl", + "c-kzg", + "derive_more 2.0.1", + "either", + "serde", + "serde_with", + "sha2", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-json-abi" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5513d5e6bd1cba6bdcf5373470f559f320c05c8c59493b6e98912fbe6733943f" +dependencies = [ + "alloy-primitives", + "alloy-sol-type-parser", + "serde", + "serde_json", +] + +[[package]] +name = "alloy-primitives" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355bf68a433e0fd7f7d33d5a9fc2583fde70bf5c530f63b80845f8da5505cf28" +dependencies = [ + "alloy-rlp", + "bytes", + "cfg-if 1.0.0", + "const-hex", + "derive_more 2.0.1", + "hashbrown 0.16.1", + "indexmap 2.11.4", + "itoa", + "paste", + "rand 0.9.2", + "ruint", + "serde", + "tiny-keccak 2.0.2", +] + +[[package]] +name = "alloy-rlp" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f70d83b765fdc080dbcd4f4db70d8d23fe4761f2f02ebfa9146b833900634b4" +dependencies = [ + "alloy-rlp-derive", + "arrayvec 0.7.4", + "bytes", +] + +[[package]] +name = "alloy-rlp-derive" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64b728d511962dda67c1bc7ea7c03736ec275ed2cf4c35d9585298ac9ccf3b73" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "alloy-serde" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6f180c399ca7c1e2fe17ea58343910cad0090878a696ff5a50241aee12fc529" +dependencies = [ + "alloy-primitives", + "serde", + "serde_json", +] + +[[package]] +name = "alloy-sol-macro" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ce480400051b5217f19d6e9a82d9010cdde20f1ae9c00d53591e4a1afbb312" +dependencies = [ + "alloy-sol-macro-expander", + "alloy-sol-macro-input", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "alloy-sol-macro-expander" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d792e205ed3b72f795a8044c52877d2e6b6e9b1d13f431478121d8d4eaa9028" +dependencies = [ + "alloy-json-abi", + "alloy-sol-macro-input", + "const-hex", + "heck 0.5.0", + "indexmap 2.11.4", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", + "syn-solidity", + "tiny-keccak 2.0.2", +] + +[[package]] +name = "alloy-sol-macro-input" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd1247a8f90b465ef3f1207627547ec16940c35597875cdc09c49d58b19693c" +dependencies = [ + "alloy-json-abi", + "const-hex", + "dunce", + "heck 0.5.0", + "macro-string", + 
"proc-macro2", + "quote", + "serde_json", + "syn 2.0.106", + "syn-solidity", +] + +[[package]] +name = "alloy-sol-type-parser" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "954d1b2533b9b2c7959652df3076954ecb1122a28cc740aa84e7b0a49f6ac0a9" +dependencies = [ + "serde", + "winnow 0.7.13", +] + +[[package]] +name = "alloy-sol-types" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70319350969a3af119da6fb3e9bddb1bce66c9ea933600cb297c8b1850ad2a3c" +dependencies = [ + "alloy-json-abi", + "alloy-primitives", + "alloy-sol-macro", +] + +[[package]] +name = "alloy-trie" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3412d52bb97c6c6cc27ccc28d4e6e8cf605469101193b50b0bd5813b1f990b5" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "arrayvec 0.7.4", + "derive_more 2.0.1", + "nybbles", + "serde", + "smallvec", + "tracing", +] + +[[package]] +name = "alloy-tx-macros" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae109e33814b49fc0a62f2528993aa8a2dd346c26959b151f05441dc0b9da292" +dependencies = [ + "darling 0.21.3", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -158,6 +431,9 @@ name = "arrayvec" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +dependencies = [ + "serde", +] [[package]] name = "arrow" @@ -468,7 +744,7 @@ checksum = "fd45deb3dbe5da5cdb8d6a670a7736d735ba65b455328440f236dfb113727a3d" dependencies = [ "Inflector", "async-graphql-parser", - "darling", + "darling 0.20.10", "proc-macro-crate", "proc-macro2", "quote", @@ -577,6 +853,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "auto_impl" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdcb70bdbc4d478427380519163274ac86e52916e10f0a8889adf0f96d3fee7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "autocfg" version = "1.3.0" @@ -717,6 +1004,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -735,6 +1028,12 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" + [[package]] name = "beef" version = "0.5.2" @@ -767,6 +1066,22 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bitcoin-io" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b47c4ab7a93edb0c7198c5535ed9b52b63095f4e9b45279c6736cec4b856baf" + +[[package]] +name = "bitcoin_hashes" +version = "0.14.0" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb18c03d0db0247e147a21a6faafd5a7eb851c743db062de72018b6b7e8e4d16" +dependencies = [ + "bitcoin-io", + "hex-conservative", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -837,6 +1152,18 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blst" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcdb4c7013139a150f9fc55d123186dbfaba0d912817466282c73ac49e71fb45" +dependencies = [ + "cc", + "glob", + "threadpool", + "zeroize", +] + [[package]] name = "bs58" version = "0.4.0" @@ -892,6 +1219,21 @@ dependencies = [ "serde", ] +[[package]] +name = "c-kzg" +version = "2.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e00bf4b112b07b505472dbefd19e37e53307e2bfed5a79e0cc161d58ccd0e687" +dependencies = [ + "blst", + "cc", + "glob", + "hex", + "libc", + "once_cell", + "serde", +] + [[package]] name = "cc" version = "1.2.43" @@ -1022,6 +1364,24 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "const-hex" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bb320cac8a0750d7f25280aa97b09c26edfe161164238ecbbb31092b079e735" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "proptest", + "serde_core", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "const-random" version = "0.1.18" @@ -1258,6 +1618,21 @@ version = "0.120.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e3f4d783a55c64266d17dc67d2708852235732a100fc40dd9f1051adc64d7b" +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.4.2" @@ -1329,6 +1704,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -1376,20 +1763,45 @@ version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.10", + "darling_macro 0.20.10", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 
0.21.3", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.106", ] [[package]] name = "darling_core" -version = "0.20.10" +version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", + "serde", "strsim", "syn 2.0.106", ] @@ -1400,7 +1812,18 @@ version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core", + "darling_core 0.20.10", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", "quote", "syn 2.0.106", ] @@ -1463,6 +1886,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "930c7171c8df9fb1782bdf9b918ed9ed2d33d1d22300abb754f9085bc48bf8e8" +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1621,6 +2054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer 0.10.4", + "const-oid", "crypto-common", "subtle", ] @@ -1694,7 +2128,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ - "darling", + "darling 0.20.10", "either", "heck 0.5.0", "proc-macro2", @@ -1702,11 +2136,54 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der", + "digest 0.10.7", + "elliptic-curve", + "rfc6979", + "serdect", + "signature", +] + [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +dependencies = [ + "serde", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = 
"b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "digest 0.10.7", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "serdect", + "subtle", + "zeroize", +] [[package]] name = "embedded-io" @@ -1865,6 +2342,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "find-msvc-tools" version = "0.1.4" @@ -1933,6 +2420,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -2104,6 +2597,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -2168,6 +2662,12 @@ dependencies = [ "time", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "globset" version = "0.4.16" @@ -2208,6 +2708,7 @@ version = "0.36.0" dependencies = [ "Inflector", "ahash", + "alloy", "anyhow", "arrow", "arrow-flight", @@ -2562,7 +3063,7 @@ dependencies = [ "graph-runtime-wasm", "graph-server-index-node", "graph-store-postgres", - "secp256k1", + "secp256k1 0.21.3", "serde", "serde_yaml", "slog", @@ -2653,6 +3154,17 @@ dependencies = [ "serde_with", ] +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "h2" version = "0.3.26" @@ -2731,10 +3243,21 @@ checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", "serde", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", + "serde", + "serde_core", +] + [[package]] name = "hdrhistogram" version = "7.5.4" @@ -2802,6 +3325,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hex-conservative" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5313b072ce3c597065a808dbf612c4c8e8590bdbf8b579508bf7a762c5eae6cd" +dependencies = [ + "arrayvec 0.7.4", +] + [[package]] name = "hex-literal" version = "0.3.4" @@ -3524,6 +4056,19 @@ dependencies = [ "tracing", ] 
+[[package]] +name = "k256" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" +dependencies = [ + "cfg-if 1.0.0", + "ecdsa", + "elliptic-curve", + "serdect", + "sha2", +] + [[package]] name = "keccak" version = "0.1.5" @@ -3722,6 +4267,17 @@ dependencies = [ "libc", ] +[[package]] +name = "macro-string" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b27834086c65ec3f9387b096d66e99f221cf081c2b738042aa252bcd41204e3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "maplit" version = "1.0.2" @@ -4028,6 +4584,18 @@ dependencies = [ "libc", ] +[[package]] +name = "nybbles" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4b5ecbd0beec843101bffe848217f770e8b8da81d8355b7d6e226f2199b3dc" +dependencies = [ + "cfg-if 1.0.0", + "ruint", + "serde", + "smallvec", +] + [[package]] name = "object" version = "0.36.7" @@ -4352,6 +4920,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.30" @@ -4525,6 +5103,28 @@ dependencies = [ "toml_edit 0.21.1", ] +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "proc-macro-utils" version = "0.10.0" @@ -4562,6 +5162,20 @@ dependencies = [ "thiserror 2.0.16", ] +[[package]] +name = "proptest" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee689443a2bd0a16ab0348b52ee43e3b2d1b1f931c8aa5c9f8de4c86fbe8c40" +dependencies = [ + "bitflags 2.9.0", + "num-traits", + "rand 0.9.2", + "rand_chacha 0.9.0", + "rand_xorshift", + "unarray", +] + [[package]] name = "prost" version = "0.13.5" @@ -4762,6 +5376,7 @@ dependencies = [ "libc", "rand_chacha 0.3.1", "rand_core 0.6.4", + "serde", ] [[package]] @@ -4772,6 +5387,7 @@ checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", + "serde", ] [[package]] @@ -4810,6 +5426,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ "getrandom 0.3.1", + "serde", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ 
+ "rand_core 0.9.3", ] [[package]] @@ -4998,6 +5624,16 @@ dependencies = [ "web-sys", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.17.13" @@ -5022,6 +5658,28 @@ dependencies = [ "rustc-hex", ] +[[package]] +name = "ruint" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68df0380e5c9d20ce49534f292a36a7514ae21350726efe1865bdb1fa91d278" +dependencies = [ + "alloy-rlp", + "proptest", + "rand 0.8.5", + "rand 0.9.2", + "ruint-macro", + "serde_core", + "valuable", + "zeroize", +] + +[[package]] +name = "ruint-macro" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48fd7bd8a6377e15ad9d42a8ec25371b94ddc67abe7c8b9127bec79bebaaae18" + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -5193,13 +5851,40 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "serdect", + "subtle", + "zeroize", +] + [[package]] name = "secp256k1" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c42e6f1735c5f00f51e43e28d6634141f2bcad10931b2609ddd74a86d751260" dependencies = [ - "secp256k1-sys", + "secp256k1-sys 0.4.2", +] + +[[package]] +name = "secp256k1" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50c5943d326858130af85e049f2661ba3c78b26589b8ab98e65e80ae44a1252" +dependencies = [ + "bitcoin_hashes", + "rand 0.8.5", + "secp256k1-sys 0.10.1", + "serde", ] [[package]] @@ -5211,6 +5896,15 @@ dependencies = [ "cc", ] +[[package]] +name = "secp256k1-sys" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4387882333d3aa8cb20530a17c69a3752e97837832f34f6dccc760e715001d9" +dependencies = [ + "cc", +] + [[package]] name = "security-framework" version = "2.11.0" @@ -5381,7 +6075,7 @@ version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" dependencies = [ - "darling", + "darling 0.20.10", "proc-macro2", "quote", "syn 2.0.106", @@ -5400,6 +6094,16 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "serdect" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84f14a19e9a014bb9f4512488d9829a68e04ecabffb0f9904cd1ace94598177" +dependencies = [ + "base16ct", + "serde", +] + [[package]] name = "sha-1" version = "0.9.8" @@ -5475,6 +6179,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest 0.10.7", + "rand_core 0.6.4", +] + [[package]] name = 
"simdutf8" version = "0.1.5" @@ -5614,6 +6328,16 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "sptr" version = "0.3.2" @@ -5858,6 +6582,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn-solidity" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff790eb176cc81bb8936aed0f7b9f14fc4670069a2d371b3e3b0ecce908b2cb3" +dependencies = [ + "paste", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "sync_wrapper" version = "0.1.2" @@ -6032,6 +6768,15 @@ dependencies = [ "once_cell", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + [[package]] name = "time" version = "0.3.36" @@ -6631,6 +7376,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.7.0" @@ -6757,6 +7508,12 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" @@ -7285,7 +8042,7 @@ dependencies = [ "pin-project", "reqwest", "rlp", - "secp256k1", + "secp256k1 0.21.3", "serde", "serde_json", "soketto", @@ -7831,6 +8588,20 @@ name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] [[package]] name = "zerovec" diff --git a/Cargo.toml b/Cargo.toml index 0845798240e..10c8ee75cf4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,6 +100,7 @@ rand = { version = "0.9.2", features = ["os_rng"] } # Dependencies related to Nozzle Subgraphs ahash = "0.8.11" +alloy = { version = "1.0.12", default-features = false, features = ["json-abi", "serde"] } arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } lazy-regex = "3.4.1" diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 23df37d0fd1..e1030f27010 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -105,6 +105,7 @@ object_store = { version = "0.12.3", features = ["gcp"] } # Dependencies related to Nozzle Subgraphs ahash.workspace = true +alloy.workspace = 
true arrow-flight.workspace = true arrow.workspace = true lazy-regex.workspace = true diff --git a/graph/src/nozzle/common/mod.rs b/graph/src/nozzle/common/mod.rs new file mode 100644 index 00000000000..f4fadb33d08 --- /dev/null +++ b/graph/src/nozzle/common/mod.rs @@ -0,0 +1,4 @@ +pub(super) mod column_aliases { + pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &["_block_num", "block_num"]; + pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &["hash", "block_hash"]; +} diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs index bd90587fbb1..9a8f5817c0e 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/nozzle/mod.rs @@ -1,7 +1,9 @@ //! This module contains the functionality required to support Nozzle Subgraphs. pub mod client; +pub mod common; pub mod error; pub mod log; +pub mod stream_aggregator; pub use self::client::{flight_client::FlightClient, Client}; diff --git a/graph/src/nozzle/stream_aggregator/error.rs b/graph/src/nozzle/stream_aggregator/error.rs new file mode 100644 index 00000000000..187e1c48506 --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/error.rs @@ -0,0 +1,49 @@ +use thiserror::Error; + +use crate::nozzle::error::IsDeterministic; + +#[derive(Debug, Error)] +pub enum Error { + #[error("failed to aggregate record batches: {0:#}")] + Aggregation(#[source] anyhow::Error), + + #[error("failed to buffer record batches from stream {stream_index}: {source:#}")] + Buffer { + stream_index: usize, + source: anyhow::Error, + }, + + #[error("failed to read record batch from stream {stream_index}: {source:#}")] + Stream { + stream_index: usize, + source: anyhow::Error, + is_deterministic: bool, + }, +} + +impl Error { + pub(super) fn stream(stream_index: usize, e: E) -> Self + where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, + { + let is_deterministic = e.is_deterministic(); + + Self::Stream { + stream_index, + source: anyhow::Error::from(e), + is_deterministic, + } + } +} + +impl IsDeterministic for Error { + fn is_deterministic(&self) -> bool { + match self { + Self::Aggregation(_) => true, + Self::Buffer { .. } => true, + Self::Stream { + is_deterministic, .. + } => *is_deterministic, + } + } +} diff --git a/graph/src/nozzle/stream_aggregator/mod.rs b/graph/src/nozzle/stream_aggregator/mod.rs new file mode 100644 index 00000000000..4ec9af8f4ba --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/mod.rs @@ -0,0 +1,198 @@ +mod error; +mod record_batch; + +use std::{ + pin::Pin, + task::{self, Poll}, +}; + +use anyhow::{anyhow, Result}; +use arrow::array::RecordBatch; +use futures03::{stream::BoxStream, Stream, StreamExt, TryStreamExt}; +use slog::{debug, info, Logger}; + +use self::record_batch::Buffer; +use crate::nozzle::{error::IsDeterministic, log::Logger as _}; + +pub use self::{ + error::Error, + record_batch::{RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}, +}; + +/// Reads record batches from multiple streams and groups them by block number and hash pairs. +/// +/// Processes each row in the response record batches and groups them by block number +/// and hash. When processing starts for a new block, all data from previous blocks +/// is grouped and streamed in batches. +/// +/// The reason the aggregation is required is to ensure compatibility with the existing +/// Subgraph storage implementation. 
+/// +/// # Stream requirements +/// +/// - Every record batch must have valid block number and hash columns +/// - Every record batch must contain blocks in ascending order +/// +/// # Performance +/// +/// To ensure data consistency and ordered output, the aggregator waits for slower streams +/// to catch up with faster streams. The output stream speed matches the slowest input stream. +pub struct StreamAggregator { + streams: Vec>>, + buffer: Buffer, + logger: Logger, + is_finalized: bool, + is_failed: bool, +} + +impl StreamAggregator { + /// Creates a new stream aggregator from the `streams` with a bounded buffer. + pub fn new( + logger: &Logger, + streams: impl IntoIterator>>, + max_buffer_size: usize, + ) -> Self + where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, + { + let logger = logger.component("nozzle::StreamAggregator"); + + let streams = streams + .into_iter() + .enumerate() + .map(|(stream_index, stream)| { + stream + .map_err(move |e| Error::stream(stream_index, e)) + .boxed() + }) + .collect::>(); + + let num_streams = streams.len(); + + info!(logger, "Initializing stream aggregator"; + "num_streams" => num_streams, + "max_buffer_size" => max_buffer_size + ); + + Self { + streams, + buffer: Buffer::new(num_streams, max_buffer_size), + logger, + is_finalized: false, + is_failed: false, + } + } + + fn poll_all_streams( + &mut self, + cx: &mut task::Context<'_>, + ) -> Poll>> { + let mut made_progress = false; + + for (stream_index, stream) in self.streams.iter_mut().enumerate() { + if self.buffer.is_finalized(stream_index) { + continue; + } + + if self.buffer.is_blocked(stream_index) { + self.is_failed = true; + + return Poll::Ready(Some(Err(Error::Buffer { + stream_index, + source: anyhow!("buffer is blocked"), + }))); + } + + if !self.buffer.has_capacity(stream_index) { + continue; + } + + match stream.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(record_batch))) if record_batch.num_rows() != 0 => { + let buffer_result = + self.buffer + .extend(stream_index, record_batch) + .map_err(|e| Error::Buffer { + stream_index, + source: e, + }); + + match buffer_result { + Ok(()) => { + made_progress = true; + + debug!(self.logger, "Buffered record batch"; + "stream_index" => stream_index, + "buffer_size" => self.buffer.size(stream_index), + "has_capacity" => self.buffer.has_capacity(stream_index) + ); + } + Err(e) => { + self.is_failed = true; + + return Poll::Ready(Some(Err(e))); + } + } + } + Poll::Ready(Some(Ok(_empty_record_batch))) => { + debug!(self.logger, "Received an empty record batch"; + "stream_index" => stream_index + ); + } + Poll::Ready(Some(Err(e))) => { + self.is_failed = true; + + return Poll::Ready(Some(Err(e))); + } + Poll::Ready(None) => { + self.buffer.finalize(stream_index); + + if self.buffer.all_finalized() { + self.is_finalized = true; + } + + made_progress = true; + + info!(self.logger, "Stream completed"; + "stream_index" => stream_index, + "buffer_size" => self.buffer.size(stream_index) + ); + } + Poll::Pending => { + // + } + } + } + + if made_progress { + if let Some(completed_groups) = + self.buffer.completed_groups().map_err(Error::Aggregation)? 
+ { + debug!(self.logger, "Sending completed record batch groups"; + "num_completed_groups" => completed_groups.len() + ); + + return Poll::Ready(Some(Ok(completed_groups))); + } + } + + if self.is_finalized { + info!(self.logger, "All streams completed"); + return Poll::Ready(None); + } + + Poll::Pending + } +} + +impl Stream for StreamAggregator { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> Poll> { + if self.is_finalized || self.is_failed { + return Poll::Ready(None); + } + + self.poll_all_streams(cx) + } +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs b/graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs new file mode 100644 index 00000000000..f513a2752ed --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs @@ -0,0 +1,230 @@ +use std::{ + collections::{btree_map::Entry, BTreeMap, HashSet}, + sync::{Arc, Weak}, +}; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Context, Result}; +use arrow::array::RecordBatch; + +use super::{Decoder, GroupData}; +use crate::cheap_clone::CheapClone; + +/// Groups record batches by block number and hash pairs. +/// +/// This aggregator collects and organizes record batches based on their +/// associated block identifiers. +pub(super) struct Aggregator { + buffer: BTreeMap<(BlockNumber, BlockHash), GroupData>, + buffered_record_batches: Vec>, + is_finalized: bool, +} + +impl Aggregator { + /// Creates a new empty aggregator. + pub(super) fn new() -> Self { + Self { + buffer: BTreeMap::new(), + buffered_record_batches: Vec::new(), + is_finalized: false, + } + } + + /// Extends this aggregator with data from a new `record_batch`. + /// + /// Processes each row in the `record_batch` and groups them by block number + /// and hash. Each unique block is stored in the internal buffer with references + /// to all rows that belong to that block. + /// + /// # Errors + /// + /// Returns an error if: + /// - `record_batch` does not contain block numbers or hashes + /// - `record_batch` contains invalid block numbers or hashes + /// - `record_batch` data is not ordered + /// - `record_batch` data is not consistent + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if this aggregator has already been finalized. 
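For orientation, the kind of record batch `extend` consumes might be assembled as below. This is only a sketch: the `_block_num` and `hash` column names mirror the aliases in `common::column_aliases`, while the extra `value` column, the 32-byte hash width, and the sample rows are assumptions about what an upstream dataset could look like.

```rust
use std::sync::Arc;

use arrow::{
    array::{ArrayRef, FixedSizeBinaryArray, RecordBatch, UInt64Array},
    datatypes::{DataType, Field, Schema},
    error::ArrowError,
};

fn example_batch() -> Result<RecordBatch, ArrowError> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_block_num", DataType::UInt64, false),
        Field::new("hash", DataType::FixedSizeBinary(32), false),
        Field::new("value", DataType::UInt64, false),
    ]));

    // Two rows for block 100 followed by one row for block 101; rows of one block
    // may span several record batches, but block numbers must stay ascending.
    let block_numbers: ArrayRef = Arc::new(UInt64Array::from(vec![100u64, 100, 101]));
    let block_hashes: ArrayRef = Arc::new(FixedSizeBinaryArray::try_from_iter(
        [[0xaau8; 32], [0xaa; 32], [0xbb; 32]].into_iter(),
    )?);
    let values: ArrayRef = Arc::new(UInt64Array::from(vec![1u64, 2, 3]));

    RecordBatch::try_new(schema, vec![block_numbers, block_hashes, values])
}
```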
+ pub(super) fn extend(&mut self, record_batch: RecordBatch) -> Result<()> { + assert!(!self.is_finalized); + + let record_batch = Arc::new(record_batch); + let decoder = Decoder::new(&record_batch)?; + + self.buffered_record_batches + .push(Arc::downgrade(&record_batch)); + + let num_rows = record_batch.num_rows(); + let mut record_batch_buffered: HashSet<(BlockNumber, BlockHash)> = + HashSet::with_capacity(num_rows); + + for row_index in 0..num_rows { + let err_cx = || format!("invalid group data at row {row_index}"); + let block_number = decoder.block_number(row_index).with_context(err_cx)?; + let block_hash = decoder.block_hash(row_index).with_context(err_cx)?; + let block_ptr = (block_number, block_hash); + + self.ensure_incremental_update(&block_ptr) + .with_context(err_cx)?; + + match self.buffer.entry(block_ptr) { + Entry::Vacant(entry) => { + entry.insert(GroupData::new(record_batch.cheap_clone(), row_index)); + record_batch_buffered.insert(block_ptr); + } + Entry::Occupied(mut entry) => { + let group_data = entry.get_mut(); + + if !record_batch_buffered.contains(&block_ptr) { + group_data.add(record_batch.cheap_clone(), row_index); + record_batch_buffered.insert(block_ptr); + } else { + group_data.add_row_index(row_index); + } + } + } + } + + Ok(()) + } + + /// Returns the block number and hash pair for the most recent completed group. + /// + /// A group is considered complete when: + /// - There is a group with a higher block number in the internal buffer + /// - This aggregator is finalized + /// + /// Any group in this aggregator with a lower block number than the one returned by + /// this method is also considered complete. + pub(super) fn max_completed_block_ptr(&self) -> Option<&(BlockNumber, BlockHash)> { + let mut iter = self.buffer.keys().rev(); + + if self.is_finalized { + return iter.next(); + } + + iter.skip(1).next() + } + + /// Returns `true` if this aggregator contains completed groups. + /// + /// A group is considered complete when: + /// - There is a group with a higher block number in the internal buffer + /// - This aggregator is finalized + pub(super) fn has_completed_groups(&self) -> bool { + (self.is_finalized && !self.buffer.is_empty()) || self.buffer.len() > 1 + } + + /// Removes and returns completed groups from this aggregator up to `max_block_ptr`. + /// + /// # Errors + /// + /// Returns an error if groups cannot be converted into record batches. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if `max_block_ptr` is greater than the most recent completed block in this aggregator. 
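The draining step below is easier to follow with the keys reduced to plain numbers. This standalone sketch (block pointers simplified to `u64`, group data to labels) illustrates the "newest completed block is the second-highest buffered block" rule and the `BTreeMap::split_off` behaviour the implementation builds on.

```rust
use std::collections::BTreeMap;

fn main() {
    // Blocks 10, 11 and 12 are buffered while the stream is still open, so only
    // 10 and 11 are complete: the newest block may still receive more rows.
    let mut buffer: BTreeMap<u64, &str> = BTreeMap::new();
    buffer.insert(10, "rows of block 10");
    buffer.insert(11, "rows of block 11");
    buffer.insert(12, "rows of block 12");

    // Highest completed block = second-highest key while the stream is open.
    let max_completed = *buffer.keys().rev().nth(1).unwrap();
    assert_eq!(max_completed, 11);

    // `split_off` keeps everything at or above the boundary as "still incomplete"
    // and leaves the completed prefix behind, mirroring how `completed_groups`
    // drains the per-stream aggregators.
    let incomplete = buffer.split_off(&12);
    assert_eq!(buffer.len(), 2);
    assert_eq!(incomplete.len(), 1);
}
```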
+ pub(super) fn completed_groups( + &mut self, + max_block_ptr: &(BlockNumber, BlockHash), + ) -> Result>> { + if self.buffer.is_empty() { + return Ok(None); + } + + let Some(max_completed_block_ptr) = self.max_completed_block_ptr() else { + return Ok(None); + }; + + assert!(max_block_ptr <= max_completed_block_ptr); + let incomplete_groups = self.buffer.split_off(max_block_ptr); + let mut completed_groups = std::mem::replace(&mut self.buffer, incomplete_groups); + + if let Some((block_ptr, _)) = self.buffer.first_key_value() { + if block_ptr == max_block_ptr { + let (block_ptr, group_data) = self.buffer.pop_first().unwrap(); + completed_groups.insert(block_ptr, group_data); + } + } + + if completed_groups.is_empty() { + return Ok(None); + } + + let completed_groups = completed_groups + .into_iter() + .map(|(block_ptr, group_data)| Ok((block_ptr, group_data.into_record_batch()?))) + .collect::>>()?; + + self.buffered_record_batches + .retain(|weak_ref| weak_ref.strong_count() > 0); + + Ok(Some(completed_groups)) + } + + /// Marks this aggregator as finalized. + /// + /// A finalized aggregator cannot be extended. + pub(super) fn finalize(&mut self) { + self.is_finalized = true; + } + + /// Returns `true` if this aggregator is finalized. + pub(super) fn is_finalized(&self) -> bool { + self.is_finalized + } + + /// Returns the number of record batches that this aggregator holds strong references to. + pub(super) fn len(&self) -> usize { + self.buffered_record_batches + .iter() + .filter(|weak_ref| weak_ref.strong_count() > 0) + .count() + } + + /// Ensures that block updates arrive in sequential order. + /// + /// Validates that the provided block number and hash represent a valid + /// incremental update relative to the last block in the buffer. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block number is less than the maximum stored block number + /// - The block number equals the maximum but has a different hash + /// + /// The returned error is deterministic. + /// + /// # Note + /// + /// Potential reorgs are not handled at this level and are + /// treated as data corruption. + fn ensure_incremental_update( + &self, + (block_number, block_hash): &(BlockNumber, BlockHash), + ) -> Result<()> { + let Some(((max_block_number, max_block_hash), _)) = self.buffer.last_key_value() else { + return Ok(()); + }; + + if block_number < max_block_number { + bail!("received block number {block_number} after {max_block_number}"); + } + + if block_number == max_block_number && block_hash != max_block_hash { + bail!( + "received block hash '0x{}' after '0x{}' for block number {block_number}", + hex::encode(&block_hash), + hex::encode(&max_block_hash) + ); + } + + Ok(()) + } +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/buffer.rs b/graph/src/nozzle/stream_aggregator/record_batch/buffer.rs new file mode 100644 index 00000000000..4b45680636c --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/record_batch/buffer.rs @@ -0,0 +1,209 @@ +use std::collections::{btree_map::Entry, BTreeMap}; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Result}; +use arrow::array::RecordBatch; + +use super::{Aggregator, RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}; + +/// Buffers record batches from multiple streams in memory and creates +/// groups of record batches by block number and hash pairs. 
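Seen from the outside, this buffering surfaces through the `StreamAggregator` stream defined earlier in this module tree. A consumer might drive it roughly as follows; this is a sketch only: the `futures` and `slog` paths, the dummy error type, and the buffer size of 16 are placeholders, and `IsDeterministic` is assumed to expose a single `is_deterministic(&self) -> bool` method.

```rust
use arrow::array::RecordBatch;
use futures::{stream, StreamExt, TryStreamExt};
use slog::{o, Discard, Logger};

// Hypothetical error type for the sketch; real callers pass the Flight client's error.
#[derive(Debug, thiserror::Error)]
#[error("stream failed")]
struct StreamError;

impl IsDeterministic for StreamError {
    fn is_deterministic(&self) -> bool {
        false
    }
}

async fn consume(batches_a: Vec<RecordBatch>, batches_b: Vec<RecordBatch>) -> Result<(), Error> {
    let logger = Logger::root(Discard, o!());

    let streams = vec![
        stream::iter(batches_a.into_iter().map(Ok::<_, StreamError>)).boxed(),
        stream::iter(batches_b.into_iter().map(Ok::<_, StreamError>)).boxed(),
    ];

    let mut aggregator = StreamAggregator::new(&logger, streams, 16);

    // Groups arrive in ascending block order, each holding the rows of exactly one block.
    while let Some(groups) = aggregator.try_next().await? {
        for ((block_number, block_hash), group) in groups {
            println!(
                "block {block_number} ({block_hash}): {} record batches",
                group.record_batches.len()
            );
        }
    }

    Ok(())
}
```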
+pub(in super::super) struct Buffer { + aggregators: Vec, + num_streams: usize, + max_buffer_size: usize, +} + +impl Buffer { + /// Creates a new buffer that can handle exactly `num_streams` number of streams. + /// + /// Creates a new associated `Aggregator` for each stream. + /// The `max_buffer_size` specifies how many record batches for each stream can be buffered at most. + pub(in super::super) fn new(num_streams: usize, max_buffer_size: usize) -> Self { + let aggregators = (0..num_streams).map(|_| Aggregator::new()).collect(); + + Self { + aggregators, + num_streams, + max_buffer_size, + } + } + + /// Extends the aggregator for `stream_index` with data from a new `record_batch`. + /// + /// # Errors + /// + /// Errors if the aggregator cannot be extended. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn extend( + &mut self, + stream_index: usize, + record_batch: RecordBatch, + ) -> Result<()> { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].extend(record_batch) + } + + /// Removes and returns all completed groups from this buffer. + /// + /// # Errors + /// + /// Errors if aggregators fail to return completed groups. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if aggregators return inconsistent responses. + pub(in super::super) fn completed_groups(&mut self) -> Result> { + let Some(max_completed_block_ptr) = self.max_completed_block_ptr()? else { + return Ok(None); + }; + + let mut ordered_completed_groups = BTreeMap::new(); + + for (stream_index, agg) in self.aggregators.iter_mut().enumerate() { + let Some(completed_groups) = agg.completed_groups(&max_completed_block_ptr)? else { + continue; + }; + + for (block_ptr, record_batch) in completed_groups { + match ordered_completed_groups.entry(block_ptr) { + Entry::Vacant(entry) => { + entry.insert(RecordBatchGroup { + record_batches: vec![StreamRecordBatch { + stream_index, + record_batch, + }], + }); + } + Entry::Occupied(mut entry) => { + entry.get_mut().record_batches.push(StreamRecordBatch { + stream_index, + record_batch, + }); + } + } + } + } + + assert!(!ordered_completed_groups.is_empty()); + Ok(Some(ordered_completed_groups)) + } + + /// Marks the aggregator for the `stream_index` as finalized. + /// + /// A finalized aggregator cannot be extended. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn finalize(&mut self, stream_index: usize) { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].finalize(); + } + + /// Returns `true` if the aggregator for `stream_index` is finalized. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn is_finalized(&self, stream_index: usize) -> bool { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].is_finalized() + } + + /// Returns `true` if all aggregators are finalized. + pub(in super::super) fn all_finalized(&self) -> bool { + self.aggregators.iter().all(|agg| agg.is_finalized()) + } + + /// Returns `true` if the aggregator for `stream_index` can be extended. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. 
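The capacity and blocking checks below reduce to a small predicate. Pulled out of the struct, with the per-stream state passed in explicitly, it reads like this (a paraphrase of the logic, not an extra API):

```rust
// A stream may be polled again only while its aggregator buffers fewer than
// `max_buffer_size` record batches. If the buffer is full, the stream is not
// finished, and nothing is complete enough to flush, the pipeline is stuck.
fn is_blocked(buffered: usize, max_buffer_size: usize, finalized: bool, has_completed: bool) -> bool {
    let has_capacity = buffered < max_buffer_size;
    !has_capacity && !finalized && !has_completed
}
```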
+ pub(in super::super) fn has_capacity(&self, stream_index: usize) -> bool { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].len() < self.max_buffer_size + } + + /// Returns `true` if the stream `stream_index` is not allowed to make progress and + /// its aggregator does not contain any completed groups. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn is_blocked(&self, stream_index: usize) -> bool { + !self.has_capacity(stream_index) + && !self.is_finalized(stream_index) + && !self.aggregators[stream_index].has_completed_groups() + } + + /// Returns the number of record batches stream `stream_index` has buffered. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn size(&self, stream_index: usize) -> usize { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].len() + } + + /// Returns the block number and hash pair for the most recent completed group across all streams. + /// + /// Finds the highest block number that all streams have completed. This ensures + /// slower streams can still produce valid completed groups without skipping any groups. + /// The function returns the minimum of all maximum completed blocks to maintain consistency. + /// + /// # Errors + /// + /// Returns an error if multiple streams return the same block number but different hashes. + /// + /// The returned error is deterministic. + /// + /// # Note + /// + /// Potential reorgs are not handled at this level and are treated as data corruption. + fn max_completed_block_ptr(&self) -> Result> { + let mut max_completed_block_ptrs: BTreeMap<&BlockNumber, &BlockHash> = BTreeMap::new(); + + for (stream_index, agg) in self.aggregators.iter().enumerate() { + let Some((max_completed_block_number, max_completed_block_hash)) = + agg.max_completed_block_ptr() + else { + if !agg.is_finalized() { + return Ok(None); + } + + continue; + }; + + match max_completed_block_ptrs.entry(max_completed_block_number) { + Entry::Vacant(entry) => { + entry.insert(max_completed_block_hash); + } + Entry::Occupied(entry) => { + if *entry.get() != max_completed_block_hash { + bail!("aggregated data is corrupted: stream {} produced block hash '0x{}' for block {}, but a previous stream set the block hash to '0x{}'", + stream_index, + hex::encode(max_completed_block_hash), + max_completed_block_number, + hex::encode(entry.get()), + ); + } + } + }; + } + + Ok(max_completed_block_ptrs + .into_iter() + .next() + .map(|(block_number, block_hash)| (*block_number, *block_hash))) + } +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs b/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs new file mode 100644 index 00000000000..5c2d69c697f --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs @@ -0,0 +1,94 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Context, Result}; +use arrow::array::{Array, FixedSizeBinaryArray, RecordBatch, UInt64Array}; + +use crate::nozzle::common::column_aliases; + +/// Decodes the data required for stream aggregation. +pub(super) struct Decoder<'a> { + /// Block numbers serve as group keys for related record batches. + block_number_column: &'a UInt64Array, + + /// Block hashes ensure data consistency across tables and datasets. 
+ block_hash_column: &'a FixedSizeBinaryArray, +} + +impl<'a> Decoder<'a> { + /// Constructs a new decoder for `record_batch`. + /// + /// # Errors + /// + /// Returns an error if: + /// - `record_batch` does not contain valid block number or hash columns + /// + /// The returned error is deterministic. + pub(super) fn new(record_batch: &'a RecordBatch) -> Result { + Ok(Self { + block_number_column: block_number_column(record_batch)?, + block_hash_column: block_hash_column(record_batch)?, + }) + } + + /// Returns the block number at `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block number at `row_index` is null + /// + /// The returned error is deterministic. + pub(super) fn block_number(&self, row_index: usize) -> Result { + if self.block_number_column.is_null(row_index) { + bail!("block number is null"); + } + + Ok(self.block_number_column.value(row_index)) + } + + /// Returns the block hash at `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block hash at `row_index` is null or invalid + /// + /// The returned error is deterministic. + pub(super) fn block_hash(&self, row_index: usize) -> Result { + if self.block_hash_column.is_null(row_index) { + bail!("block hash is null"); + } + + BlockHash::try_from(self.block_hash_column.value(row_index)) + .context("block hash is invalid") + } +} + +fn block_number_column<'a>(record_batch: &'a RecordBatch) -> Result<&'a UInt64Array> { + for &column_name in column_aliases::BLOCK_NUMBER { + let Some(column) = record_batch.column_by_name(column_name) else { + continue; + }; + + return column + .as_any() + .downcast_ref() + .context("failed to downcast block number column"); + } + + bail!("failed to find block number column"); +} + +fn block_hash_column<'a>(record_batch: &'a RecordBatch) -> Result<&'a FixedSizeBinaryArray> { + for &column_name in column_aliases::BLOCK_HASH { + let Some(column) = record_batch.column_by_name(column_name) else { + continue; + }; + + return column + .as_any() + .downcast_ref() + .context("failed to downcast block hash column"); + } + + bail!("failed to find block hash column"); +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs b/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs new file mode 100644 index 00000000000..32d3317c585 --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs @@ -0,0 +1,88 @@ +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow::{ + array::{RecordBatch, UInt64Array}, + compute::{concat_batches, take_record_batch}, +}; + +/// Contains references to all record batches and rows of a group. +pub(super) struct GroupData { + parts: Vec, +} + +struct Part { + record_batch: Arc, + row_indices: Vec, +} + +impl GroupData { + /// Creates a new group with an initial `record_batch` and `row_index`. + pub(super) fn new(record_batch: Arc, row_index: usize) -> Self { + Self { + parts: vec![Part { + record_batch, + row_indices: vec![row_index as u64], + }], + } + } + + /// Adds a new `record_batch` and `row_index` to this group. + pub(super) fn add(&mut self, record_batch: Arc, row_index: usize) { + self.parts.push(Part { + record_batch, + row_indices: vec![row_index as u64], + }) + } + + /// Adds a `row_index` to the most recent record batch in this group. + /// + /// # Panics + /// + /// Panics if this group is empty. 
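The `into_record_batch` conversion further down relies on two Arrow kernels, `take_record_batch` and `concat_batches`. A standalone sketch of those kernels on a toy batch (column names and row values are made up):

```rust
use std::sync::Arc;

use arrow::{
    array::{ArrayRef, Int64Array, RecordBatch, UInt64Array},
    compute::{concat_batches, take_record_batch},
    datatypes::{DataType, Field, Schema},
    error::ArrowError,
};

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_block_num", DataType::UInt64, false),
        Field::new("value", DataType::Int64, false),
    ]));

    let block_numbers: ArrayRef = Arc::new(UInt64Array::from(vec![10u64, 11, 10]));
    let values: ArrayRef = Arc::new(Int64Array::from(vec![1i64, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![block_numbers, values])?;

    // Gather the rows that belong to block 10 (row indices 0 and 2), the way a
    // group gathers its row indices from each source record batch.
    let indices = UInt64Array::from(vec![0u64, 2]);
    let block_10 = take_record_batch(&batch, &indices)?;

    // Merge all partial selections of a group into a single record batch.
    let merged = concat_batches(&schema, &[block_10])?;
    assert_eq!(merged.num_rows(), 2);

    Ok(())
}
```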
+ pub(super) fn add_row_index(&mut self, row_index: usize) { + assert!(!self.parts.is_empty()); + + self.parts + .last_mut() + .unwrap() + .row_indices + .push(row_index as u64); + } + + /// Converts this group into a single record batch. + /// + /// Merges all group rows from all record batches together. + /// + /// # Errors + /// + /// Returns an error if the record batches in this group have incompatible types. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if: + /// - This group is empty + /// - This group contains invalid row indices + pub(super) fn into_record_batch(self) -> Result { + assert!(!self.parts.is_empty()); + + let schema = self.parts[0].record_batch.schema(); + let mut partial_record_batches = Vec::with_capacity(self.parts.len()); + + for part in self.parts { + let Part { + record_batch, + row_indices, + } = part; + + let row_indices = UInt64Array::from(row_indices); + let partial_record_batch = take_record_batch(&record_batch, &row_indices).unwrap(); + + partial_record_batches.push(partial_record_batch); + } + + concat_batches(&schema, &partial_record_batches).context("failed to merge record batches") + } +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/mod.rs b/graph/src/nozzle/stream_aggregator/record_batch/mod.rs new file mode 100644 index 00000000000..171f360f5fa --- /dev/null +++ b/graph/src/nozzle/stream_aggregator/record_batch/mod.rs @@ -0,0 +1,38 @@ +//! This module handles grouping record batches from multiple streams. +//! +//! # Safety +//! +//! The implementation occasionally uses `assert` and `unwrap` to ensure consistency +//! between related types and methods. +//! +//! This is safe because the functionality is internal and not exposed to other modules. +//! +//! A panic indicates a critical error in the grouping algorithm. + +mod aggregator; +mod buffer; +mod decoder; +mod group_data; + +use std::collections::BTreeMap; + +use alloy::primitives::{BlockHash, BlockNumber}; +use arrow::array::RecordBatch; + +use self::{aggregator::Aggregator, decoder::Decoder, group_data::GroupData}; + +pub(super) use buffer::Buffer; + +/// Maps block number and hash pairs to record batches. +pub type RecordBatchGroups = BTreeMap<(BlockNumber, BlockHash), RecordBatchGroup>; + +/// Contains record batches associated with a specific block number and hash pair. +pub struct RecordBatchGroup { + pub record_batches: Vec, +} + +/// Contains a record batch and the index of its source stream. 
+pub struct StreamRecordBatch { + pub stream_index: usize, + pub record_batch: RecordBatch, +} From 08d7a21a85704d6cef105376e68c91efaf6aa656 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 03/40] feat(graph): add Nozzle data decoder --- Cargo.lock | 1 + Cargo.toml | 1 + graph/Cargo.toml | 1 + graph/src/nozzle/codec/array_decoder.rs | 429 ++++++++++++++++++++++ graph/src/nozzle/codec/decoder.rs | 29 ++ graph/src/nozzle/codec/list_decoder.rs | 88 +++++ graph/src/nozzle/codec/mapping_decoder.rs | 32 ++ graph/src/nozzle/codec/mod.rs | 238 ++++++++++++ graph/src/nozzle/codec/name_cache.rs | 77 ++++ graph/src/nozzle/codec/value_decoder.rs | 350 ++++++++++++++++++ graph/src/nozzle/mod.rs | 6 +- 11 files changed, 1251 insertions(+), 1 deletion(-) create mode 100644 graph/src/nozzle/codec/array_decoder.rs create mode 100644 graph/src/nozzle/codec/decoder.rs create mode 100644 graph/src/nozzle/codec/list_decoder.rs create mode 100644 graph/src/nozzle/codec/mapping_decoder.rs create mode 100644 graph/src/nozzle/codec/mod.rs create mode 100644 graph/src/nozzle/codec/name_cache.rs create mode 100644 graph/src/nozzle/codec/value_decoder.rs diff --git a/Cargo.lock b/Cargo.lock index 85ca7d28b55..06b1c636052 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2734,6 +2734,7 @@ dependencies = [ "futures 0.3.31", "graph_derive", "graphql-parser", + "heck 0.5.0", "hex", "hex-literal 1.0.0", "http 0.2.12", diff --git a/Cargo.toml b/Cargo.toml index 10c8ee75cf4..b5f2029580c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,7 @@ ahash = "0.8.11" alloy = { version = "1.0.12", default-features = false, features = ["json-abi", "serde"] } arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } +heck = "0.5.0" lazy-regex = "3.4.1" # Incremental compilation on Rust 1.58 causes an ICE on build. As soon as graph node builds again, these can be removed. diff --git a/graph/Cargo.toml b/graph/Cargo.toml index e1030f27010..30b68247d16 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -108,6 +108,7 @@ ahash.workspace = true alloy.workspace = true arrow-flight.workspace = true arrow.workspace = true +heck.workspace = true lazy-regex.workspace = true [dev-dependencies] diff --git a/graph/src/nozzle/codec/array_decoder.rs b/graph/src/nozzle/codec/array_decoder.rs new file mode 100644 index 00000000000..d0f5bf12438 --- /dev/null +++ b/graph/src/nozzle/codec/array_decoder.rs @@ -0,0 +1,429 @@ +use std::{fmt::Display, sync::LazyLock}; + +use alloy::primitives::B256; +use anyhow::{anyhow, Result}; +use arrow::{ + array::{ + timezone::Tz, Array, ArrayAccessor, BinaryArray, BinaryViewArray, BooleanArray, + Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeStringArray, PrimitiveArray, StringArray, StringViewArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, + }, + datatypes::ArrowTemporalType, +}; +use chrono::{DateTime, Utc}; + +use super::decoder::Decoder; +use crate::data::store::scalar::{BigDecimal, BigInt}; + +/// Decodes Arrow arrays into Subgraph types. +pub(super) struct ArrayDecoder<'a, T: 'static>(&'a T); + +impl<'a, T> ArrayDecoder<'a, T> +where + T: Array + 'static, +{ + /// Creates a new Arrow array decoder. 
+ /// + /// # Errors + /// + /// Returns an error if the `array` cannot be downcasted to type `T`. + /// + /// The returned error is deterministic. + pub(super) fn new(array: &'a dyn Array) -> Result { + Ok(Self(downcast_ref(array)?)) + } +} + +macro_rules! check_value { + ($self:ident, $row_index:ident) => { + if $row_index >= $self.0.len() { + return Ok(None); + } + + if $self.0.is_null($row_index) { + return Ok(None); + } + }; +} + +impl<'a, T> ArrayDecoder<'a, T> +where + &'a T: ArrayAccessor, +{ + fn value( + &'a self, + row_index: usize, + mapping: impl FnOnce(<&'a T as ArrayAccessor>::Item) -> Result, + ) -> Result> { + check_value!(self, row_index); + mapping(self.0.value(row_index)).map(Some) + } +} + +impl Decoder> for ArrayDecoder<'_, BooleanArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, Ok) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, 
row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, Ok) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(f64::from(x).into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(f64::from(x).into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + let scale = self.0.scale() as i64; + let big_int = decode_signed_big_int(x.to_le_bytes())?; + + Ok(BigDecimal::new(big_int, -scale)) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_i32(x.as_i128())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_i64(x.as_i128())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + let scale = self.0.scale() as i64; + let big_int = decode_signed_big_int(x.to_le_bytes())?; + + Ok(BigDecimal::new(big_int, -scale)) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} + +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} 
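Every one of these `ArrayDecoder` impls follows the same null-aware access pattern; stripped of the trait plumbing it amounts to the following (a simplified sketch for one concrete array type):

```rust
use arrow::array::{Array, Int64Array};

// Out-of-range rows and null slots both decode to `None`; only then is the
// raw Arrow value read and converted.
fn int64_at(array: &dyn Array, row_index: usize) -> Option<i64> {
    let array = array.as_any().downcast_ref::<Int64Array>()?;

    if row_index >= array.len() || array.is_null(row_index) {
        return None;
    }

    Some(array.value(row_index))
}
```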
+ +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} + +impl Decoder>> for ArrayDecoder<'_, BinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, BinaryViewArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, FixedSizeBinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder> for ArrayDecoder<'_, FixedSizeBinaryArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + B256::try_from(x) + .map_err(|_| anyhow!("failed to convert '{}' to 'B256'", hex::encode(x))) + }) + } +} + +impl Decoder>> for ArrayDecoder<'_, LargeBinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampSecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampMillisecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampMicrosecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampNanosecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +fn downcast_ref<'a, T>(array: &'a dyn Array) -> Result<&'a T> +where + T: Array + 'static, +{ + array + .as_any() + .downcast_ref() + .ok_or_else(|| anyhow!("failed to downcast array")) +} + +fn decode_i32(n: T) -> Result +where + T: TryInto + Copy + Display, +{ + n.try_into() + .map_err(|_| anyhow!("failed to convert '{n}' to 'i32'")) +} + +fn decode_i64(n: T) -> Result +where + T: TryInto + Copy + Display, +{ + n.try_into() + .map_err(|_| anyhow!("failed to convert '{n}' to 'i64'")) +} + +fn decode_signed_big_int(le_bytes: impl AsRef<[u8]>) -> Result { + let le_bytes = le_bytes.as_ref(); + + BigInt::from_signed_bytes_le(le_bytes) + .map_err(|_| anyhow!("failed to convert '{}' to 'BigInt'", hex::encode(le_bytes))) +} + +fn decode_unsigned_big_int(le_bytes: impl AsRef<[u8]>) -> Result { + let le_bytes = le_bytes.as_ref(); + + BigInt::from_unsigned_bytes_le(le_bytes) + .map_err(|_| anyhow!("failed to convert '{}' to 'BigInt'", hex::encode(le_bytes))) +} + +fn decode_timestamp(array: &PrimitiveArray, row_index: usize) -> Result> +where + T: ArrowTemporalType, + i64: From, +{ + static UTC: LazyLock = LazyLock::new(|| "+00:00".parse().unwrap()); + + let Some(timestamp) = array.value_as_datetime_with_tz(row_index, *UTC) else { + return Err(anyhow!("failed to decode timestamp; unknown timezone")); + }; + + Ok(timestamp.to_utc()) +} diff --git a/graph/src/nozzle/codec/decoder.rs b/graph/src/nozzle/codec/decoder.rs new file mode 100644 index 00000000000..6d433ba86f2 --- /dev/null +++ b/graph/src/nozzle/codec/decoder.rs @@ -0,0 +1,29 @@ +use anyhow::Result; + +/// Decodes Arrow data at specific row indices into Subgraph types. 
+/// +/// This trait provides a common interface for converting Arrow format data into +/// custom types. Implementations handle the specifics of extracting data from +/// Arrow arrays and constructing the target type `T`. +pub(super) trait Decoder { + /// Decodes and returns the value at the `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The data cannot be converted to type `T` + /// - The underlying Arrow data is invalid or corrupted + /// + /// The returned error is deterministic. + fn decode(&self, row_index: usize) -> Result; +} + +/// Forwards decoding operations through boxed trait objects. +/// +/// This implementation enables using `Box>` as a decoder, +/// delegating to the underlying implementation. +impl Decoder for Box + '_> { + fn decode(&self, row_index: usize) -> Result { + (**self).decode(row_index) + } +} diff --git a/graph/src/nozzle/codec/list_decoder.rs b/graph/src/nozzle/codec/list_decoder.rs new file mode 100644 index 00000000000..4c0e2a44504 --- /dev/null +++ b/graph/src/nozzle/codec/list_decoder.rs @@ -0,0 +1,88 @@ +use anyhow::Result; + +use super::decoder::Decoder; + +/// Decodes Arrow lists to vectors of decoded values. +pub(super) struct ListDecoder<'a, T> { + decoder: T, + offsets: ArrayOffsets<'a>, +} + +/// Contains row index offsets used to determine how many values to decode from an Arrow list. +pub(super) enum ArrayOffsets<'a> { + Small(&'a [i32]), + Large(&'a [i64]), + Fixed(i32), +} + +impl<'a, T> ListDecoder<'a, T> { + /// Creates a new Arrow list decoder with provided `offsets`. + pub(super) fn new(decoder: T, offsets: ArrayOffsets<'a>) -> Self { + Self { decoder, offsets } + } +} + +impl<'a, T, V> Decoder>> for ListDecoder<'a, T> +where + T: Decoder, +{ + fn decode(&self, row_index: usize) -> Result>> { + let Some(range) = self.offsets.range(row_index) else { + return Ok(None); + }; + + let values = range + .map(|row_index| self.decoder.decode(row_index)) + .collect::, _>>()?; + + if values.is_empty() { + return Ok(None); + } + + Ok(Some(values)) + } +} + +impl<'a> ArrayOffsets<'a> { + /// Returns row indices belonging to a list at `row_index`. + fn range(&self, row_index: usize) -> Option> { + match self { + Self::Small(offsets) => { + let start = *offsets.get(row_index)? as usize; + let end = *offsets.get(row_index + 1)? as usize; + + Some(start..end) + } + Self::Large(offsets) => { + let start = *offsets.get(row_index)? as usize; + let end = *offsets.get(row_index + 1)? as usize; + + Some(start..end) + } + Self::Fixed(value_length) => { + let start = *value_length as usize * row_index; + let end = *value_length as usize * (row_index + 1); + + Some(start..end) + } + } + } +} + +impl<'a> From<&'a [i32]> for ArrayOffsets<'a> { + fn from(offsets: &'a [i32]) -> Self { + Self::Small(offsets) + } +} + +impl<'a> From<&'a [i64]> for ArrayOffsets<'a> { + fn from(offsets: &'a [i64]) -> Self { + Self::Large(offsets) + } +} + +impl From for ArrayOffsets<'static> { + fn from(value_length: i32) -> Self { + Self::Fixed(value_length) + } +} diff --git a/graph/src/nozzle/codec/mapping_decoder.rs b/graph/src/nozzle/codec/mapping_decoder.rs new file mode 100644 index 00000000000..b0c85e9d2e6 --- /dev/null +++ b/graph/src/nozzle/codec/mapping_decoder.rs @@ -0,0 +1,32 @@ +use anyhow::Result; + +use super::decoder::Decoder; + +/// Decodes Arrow arrays and maps the decoded values to a different type. 
+pub(super) struct MappingDecoder { + decoder: T, + mapping: Box V + 'static>, +} + +impl MappingDecoder { + /// Creates a new decoder that wraps the `decoder`. + /// + /// The `mapping` function transforms decoded values from type `U` to type `V`. + pub(super) fn new(decoder: T, mapping: impl Fn(U) -> V + 'static) -> Self { + Self { + decoder, + mapping: Box::new(mapping), + } + } +} + +impl Decoder for MappingDecoder +where + T: Decoder, +{ + fn decode(&self, row_index: usize) -> Result { + let value = self.decoder.decode(row_index)?; + + Ok((&self.mapping)(value)) + } +} diff --git a/graph/src/nozzle/codec/mod.rs b/graph/src/nozzle/codec/mod.rs new file mode 100644 index 00000000000..3cb55e71227 --- /dev/null +++ b/graph/src/nozzle/codec/mod.rs @@ -0,0 +1,238 @@ +mod array_decoder; +mod decoder; +mod list_decoder; +mod mapping_decoder; +mod name_cache; +mod value_decoder; + +use std::collections::{BTreeMap, HashMap}; + +use anyhow::{anyhow, bail, Context, Result}; +use arrow::array::{Array, RecordBatch}; + +use self::{ + array_decoder::ArrayDecoder, + decoder::Decoder, + list_decoder::ListDecoder, + mapping_decoder::MappingDecoder, + name_cache::{NameCache, NormalizedName}, +}; +use crate::{ + data::{ + graphql::TypeExt, + store::{Id, IdType, Value}, + value::Word, + }, + schema::{EntityKey, EntityType, Field, InputSchema}, +}; + +/// Handles decoding of record batches to Subgraph entities. +pub struct Codec { + input_schema: InputSchema, + name_cache: NameCache, +} + +/// Contains the entities decoded from a record batch. +pub struct DecodeOutput { + /// The type of entities in this batch. + pub entity_type: EntityType, + + /// The type of the ID of entities in this batch. + pub id_type: IdType, + + /// A list of decoded entities of the same type. + pub decoded_entities: Vec, +} + +/// Contains a single entity decoded from a record batch. +pub struct DecodedEntity { + /// The unique ID of the entity. + /// + /// When set to `None`, the ID is expected to be auto-generated before a new entity is persisted. + pub key: Option, + + /// A list of entity field names and their values. + /// + /// This list could contain a subset of fields of an entity. + pub entity_data: Vec<(Word, Value)>, +} + +impl Codec { + /// Creates a new decoder for the `input_schema`. + pub fn new(input_schema: InputSchema) -> Self { + let name_cache = NameCache::new(); + + Self { + input_schema, + name_cache, + } + } + + /// Decodes a `record_batch` according to the schema of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if `record_batch` is not compatible with the schema of the entity with name `entity_name`. + /// + /// The returned error is deterministic. 
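Before the implementation, a rough sketch of how a caller might consume this API. The `Token` entity name is hypothetical, and building the `InputSchema`-backed codec as well as obtaining the record batch are out of scope here.

```rust
use anyhow::Result;
use arrow::array::RecordBatch;

// `Codec` and `DecodedEntity` are the types defined in this module.
fn decode_tokens(codec: &mut Codec, record_batch: RecordBatch) -> Result<Vec<DecodedEntity>> {
    let output = codec.decode(record_batch, "Token")?;

    for entity in &output.decoded_entities {
        match &entity.key {
            // The row carried an `id` column: the entity is written under this key.
            Some(_key) => {}
            // No `id` column: an ID is auto-generated before the entity is persisted.
            None => {}
        }
    }

    Ok(output.decoded_entities)
}
```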
+ pub fn decode(&mut self, record_batch: RecordBatch, entity_name: &str) -> Result { + let entity_type = self.entity_type(entity_name)?; + let id_type = entity_type.id_type()?; + let value_decoders = self.value_decoders(&entity_type, &record_batch)?; + let mut decoded_entities = Vec::with_capacity(record_batch.num_rows()); + + for i in 0..record_batch.num_rows() { + let err_ctx = |s: &str| format!("field '{s}' at row {i}"); + let mut entity_id: Option = None; + let mut entity_data = Vec::with_capacity(value_decoders.len()); + + for (&field_name, value_decoder) in &value_decoders { + let value = value_decoder + .decode(i) + .with_context(|| err_ctx(field_name))?; + + if field_name.eq_ignore_ascii_case("id") { + entity_id = Some(value.clone()); + } + + entity_data.push((Word::from(field_name), value)); + } + + let entity_key = entity_id + .map(Id::try_from) + .transpose() + .with_context(|| err_ctx("id"))? + .map(|entity_id| entity_type.key(entity_id)); + + decoded_entities.push(DecodedEntity { + key: entity_key, + entity_data, + }); + } + + drop(value_decoders); + + Ok(DecodeOutput { + entity_type, + id_type, + decoded_entities, + }) + } + + /// Returns the type of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if: + /// - There is no entity with name `entity_name` + /// - The entity is not an object + /// - The entity is a POI entity + /// + /// The returned error is deterministic. + fn entity_type(&self, entity_name: &str) -> Result { + let entity_type = self + .input_schema + .entity_type(entity_name) + .context("entity not found")?; + + if !entity_type.is_object_type() { + return Err(anyhow!("entity is not an object")); + } + + if entity_type.is_poi() { + return Err(anyhow!("entity is POI entity")); + } + + Ok(entity_type) + } + + /// Creates and returns value decoders for the fields of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if a decoder could not be created for a required field. + /// + /// The returned error is deterministic. + fn value_decoders<'a>( + &mut self, + entity_type: &'a EntityType, + record_batch: &'a RecordBatch, + ) -> Result + 'a>>> { + let object_type = entity_type.object_type().unwrap(); + let columns = record_batch + .schema_ref() + .fields() + .into_iter() + .zip(record_batch.columns()) + .map(|(field, array)| (self.normalized_name(field.name()), array.as_ref())) + .collect::>(); + + let mut value_decoders = BTreeMap::new(); + for field in &object_type.fields { + let Some(value_decoder) = self.value_decoder(field, &columns)? else { + continue; + }; + + value_decoders.insert(field.name.as_str(), value_decoder); + } + + Ok(value_decoders) + } + + /// Creates and returns a value decoder for the `field`. + /// + /// Returns `None` when the `field` does not require a decoder. + /// This happens for derived fields, reserved fields, and when there is no associated + /// Arrow array for a nullable `field` or a `field` that could be auto-generated. + /// + /// # Errors + /// + /// Returns an error if: + /// - There is no associated Arrow array for a required `field` + /// - The `field` type is not compatible with the Arrow array + /// + /// The returned error is deterministic. 
+ fn value_decoder<'a>( + &mut self, + field: &'a Field, + columns: &HashMap, + ) -> Result + 'a>>> { + // VIDs are auto-generated + if field.name.eq_ignore_ascii_case("vid") { + return Ok(None); + } + + // Derived fields are handled automatically + if field.is_derived() { + return Ok(None); + } + + let normalized_name = self.normalized_name(&field.name); + let array = match columns.get(&normalized_name) { + Some(&array) => array, + None => { + // Allow ID auto-generation + if field.name.eq_ignore_ascii_case("id") { + return Ok(None); + } + + // Allow partial entities + if !field.field_type.is_non_null() { + return Ok(None); + } + + bail!("failed to get column for field '{}'", field.name); + } + }; + + let decoder = value_decoder::value_decoder(field.value_type, field.is_list(), array) + .with_context(|| format!("failed to create decoder for field '{}'", field.name))?; + + Ok(Some(decoder)) + } + + // Returns a normalized version of `name`. + fn normalized_name(&mut self, name: impl AsRef) -> NormalizedName { + self.name_cache.normalized(name.as_ref()) + } +} diff --git a/graph/src/nozzle/codec/name_cache.rs b/graph/src/nozzle/codec/name_cache.rs new file mode 100644 index 00000000000..8a56a1558fc --- /dev/null +++ b/graph/src/nozzle/codec/name_cache.rs @@ -0,0 +1,77 @@ +use std::{collections::HashMap, sync::Arc}; + +use heck::ToSnakeCase; + +use crate::{cheap_clone::CheapClone, derive::CheapClone}; + +/// Provides case-insensitive string comparison through normalization. +/// +/// Normalizes names and stores them in memory for fast access. +pub(super) struct NameCache { + cache: HashMap, NormalizedName>, +} + +/// Contains a normalized name. +/// +/// A normalized name is a list of lowercase tokens from the original name. +/// A token is a sequence of characters between case format separators. +#[derive(Debug, Clone, CheapClone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(super) struct NormalizedName(Arc<[Box]>); + +impl NameCache { + /// Creates a new empty cache. + pub(super) fn new() -> Self { + Self { + cache: HashMap::new(), + } + } + + /// Takes a `name` with any case format and returns a normalized version. + /// + /// A normalized version is a list of lowercase tokens from the original input. + /// A token is a sequence of characters between case format separators. + /// + /// # Example + /// + /// ```no_run + /// let mut name_cache = NameCache::new(); + /// + /// assert_eq!( + /// name_cache.normalized("blockNumber"), + /// vec!["block".into(), "number".into()].into(), + /// ); + /// assert_eq!( + /// name_cache.normalized("block number"), + /// vec!["block".into(), "number".into()].into(), + /// ); + /// assert_eq!( + /// name_cache.normalized("block_number"), + /// vec!["block".into(), "number".into()].into(), + /// ); + /// ``` + pub(super) fn normalized(&mut self, name: &str) -> NormalizedName { + if let Some(normalized_name) = self.cache.get(name) { + return normalized_name.cheap_clone(); + } + + let normalized_name = NormalizedName::new(name); + + self.cache + .insert(name.into(), normalized_name.cheap_clone()); + + normalized_name + } +} + +impl NormalizedName { + /// Creates a normalized name from the input string. 
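The normalization itself is just heck's snake-case conversion split into tokens. As a standalone illustration of why differently cased column and field names end up comparing equal:

```rust
use heck::ToSnakeCase;

// Any case convention collapses to the same lowercase token list, so
// `blockNumber`, `BLOCK_NUMBER` and `block_number` all normalize identically.
fn tokens(name: &str) -> Vec<String> {
    name.to_snake_case()
        .split('_')
        .map(|token| token.to_string())
        .collect()
}

fn main() {
    assert_eq!(tokens("blockNumber"), vec!["block", "number"]);
    assert_eq!(tokens("BLOCK_NUMBER"), vec!["block", "number"]);
    assert_eq!(tokens("block_number"), vec!["block", "number"]);
}
```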
+ fn new(name: &str) -> Self { + Self( + name.to_snake_case() + .split('_') + .map(Into::into) + .collect::>() + .into(), + ) + } +} diff --git a/graph/src/nozzle/codec/value_decoder.rs b/graph/src/nozzle/codec/value_decoder.rs new file mode 100644 index 00000000000..3a56f9b21a8 --- /dev/null +++ b/graph/src/nozzle/codec/value_decoder.rs @@ -0,0 +1,350 @@ +use anyhow::{anyhow, Context, Result}; +use arrow::{ + array::{ + Array, BinaryArray, BinaryViewArray, BooleanArray, Decimal128Array, Decimal256Array, + FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeListArray, + LargeListViewArray, LargeStringArray, ListArray, ListViewArray, StringArray, + StringViewArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, + }, + datatypes::{DataType, TimeUnit}, +}; +use chrono::{DateTime, Utc}; + +use super::{ArrayDecoder, Decoder, ListDecoder, MappingDecoder}; +use crate::data::store::{ + scalar::{BigDecimal, BigInt, Bytes, Timestamp}, + Value, ValueType, +}; + +/// Returns a decoder that converts an Arrow array into Subgraph store values. +/// +/// # Errors +/// +/// Returns an error if the Subgraph store type is not compatible with the Arrow array type. +/// +/// The returned error is deterministic. +pub(super) fn value_decoder<'a>( + value_type: ValueType, + is_list: bool, + array: &'a dyn Array, +) -> Result + 'a>> { + let decoder = if is_list { + list_value_decoder(value_type, array) + } else { + single_value_decoder(value_type, array) + }; + + decoder.with_context(|| { + format!( + "failed to decode '{}' from '{}'", + value_type.to_str(), + array.data_type(), + ) + }) +} + +fn list_value_decoder<'a>( + value_type: ValueType, + array: &'a dyn Array, +) -> Result + 'a>> { + match array.data_type() { + DataType::List(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::ListView(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::FixedSizeList(_, _) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_length().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::LargeList(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::LargeListView(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + data_type => Err(anyhow!("'{data_type}' is not a supported list type")), + } +} + +fn single_value_decoder<'a>( + value_type: ValueType, + array: &'a dyn Array, +) -> Result + 'a>> { + let 
incompatible_types_err = || Err(anyhow!("incompatible types")); + + let decoder = match (value_type, array.data_type()) { + (ValueType::Boolean, DataType::Boolean) => { + let array_decoder = ArrayDecoder::::new(array)?; + mapping_decoder(array_decoder, Value::Bool) + } + (ValueType::Boolean, _) => return incompatible_types_err(), + + (ValueType::Int, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::Int) + } + (ValueType::Int, _) => return incompatible_types_err(), + + (ValueType::Int8, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::Int8) + } + (ValueType::Int8, _) => return incompatible_types_err(), + + (ValueType::BigInt, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::BigInt) + } + (ValueType::BigInt, _) => return incompatible_types_err(), + + (ValueType::BigDecimal, data_type) if is_decimal(data_type) => { + let decimal_decoder = decimal_decoder::>(array)?; + mapping_decoder(decimal_decoder, Value::BigDecimal) + } + (ValueType::BigDecimal, _) => return incompatible_types_err(), + + (ValueType::Bytes, data_type) if is_binary(data_type) => { + let binary_decoder = binary_decoder::>>(array)?; + mapping_decoder(binary_decoder, |x| Bytes::from(&*x).into()) + } + (ValueType::Bytes, _) => return incompatible_types_err(), + + (ValueType::String, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::String) + } + (ValueType::String, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, |x| x.to_string().into()) + } + (ValueType::String, data_type) if is_binary(data_type) => { + let binary_decoder = binary_decoder::>>(array)?; + mapping_decoder(binary_decoder, |x| format!("0x{}", hex::encode(x)).into()) + } + (ValueType::String, _) => return incompatible_types_err(), + + (ValueType::Timestamp, data_type) if is_timestamp(data_type) => { + let timestamp_decoder = timestamp_decoder::>>(array)?; + mapping_decoder(timestamp_decoder, |x| Timestamp(x).into()) + } + (ValueType::Timestamp, _) => return incompatible_types_err(), + }; + + Ok(decoder) +} + +fn mapping_decoder<'a, T, U: 'static>( + array_decoder: T, + mapping: fn(U) -> Value, +) -> Box + 'a> +where + T: Decoder> + 'a, +{ + Box::new(MappingDecoder::new( + array_decoder, + move |value: Option| match value { + Some(value) => mapping(value), + None => Value::Null, + }, + )) +} + +fn is_integer(data_type: &DataType) -> bool { + use DataType::*; + + matches! 
{
+        data_type,
+        Int8 | Int16 | Int32 | Int64 |
+        UInt8 | UInt16 | UInt32 | UInt64 |
+        Decimal128(_, 0) | Decimal256(_, 0)
+    }
+}
+
+fn integer_decoder<'a, T>(array: &'a dyn Array) -> Result<Box<dyn Decoder<Output = T> + 'a>>
+where
+    T: 'static,
+    ArrayDecoder<'a, Int8Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Int16Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Int32Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Int64Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, UInt8Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, UInt16Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, UInt32Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, UInt64Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Decimal128Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Decimal256Array>: Decoder<Output = T>,
+{
+    use DataType::*;
+
+    let array_decoder: Box<dyn Decoder<Output = T> + 'a> = match array.data_type() {
+        Int8 => Box::new(ArrayDecoder::<Int8Array>::new(array)?),
+        Int16 => Box::new(ArrayDecoder::<Int16Array>::new(array)?),
+        Int32 => Box::new(ArrayDecoder::<Int32Array>::new(array)?),
+        Int64 => Box::new(ArrayDecoder::<Int64Array>::new(array)?),
+        UInt8 => Box::new(ArrayDecoder::<UInt8Array>::new(array)?),
+        UInt16 => Box::new(ArrayDecoder::<UInt16Array>::new(array)?),
+        UInt32 => Box::new(ArrayDecoder::<UInt32Array>::new(array)?),
+        UInt64 => Box::new(ArrayDecoder::<UInt64Array>::new(array)?),
+        Decimal128(_, 0) => Box::new(ArrayDecoder::<Decimal128Array>::new(array)?),
+        Decimal256(_, 0) => Box::new(ArrayDecoder::<Decimal256Array>::new(array)?),
+        data_type => return Err(anyhow!("'{data_type}' is not a supported integer type")),
+    };
+
+    Ok(array_decoder)
+}
+
+fn is_decimal(data_type: &DataType) -> bool {
+    use DataType::*;
+
+    matches! {
+        data_type,
+        Float16 | Float32 | Float64 |
+        Decimal128(_, _) | Decimal256(_, _)
+    }
+}
+
+fn decimal_decoder<'a, T>(array: &'a dyn Array) -> Result<Box<dyn Decoder<Output = T> + 'a>>
+where
+    T: 'static,
+    ArrayDecoder<'a, Float16Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Float32Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Float64Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Decimal128Array>: Decoder<Output = T>,
+    ArrayDecoder<'a, Decimal256Array>: Decoder<Output = T>,
+{
+    use DataType::*;
+
+    let array_decoder: Box<dyn Decoder<Output = T> + 'a> = match array.data_type() {
+        Float16 => Box::new(ArrayDecoder::<Float16Array>::new(array)?),
+        Float32 => Box::new(ArrayDecoder::<Float32Array>::new(array)?),
+        Float64 => Box::new(ArrayDecoder::<Float64Array>::new(array)?),
+        Decimal128(_, _) => Box::new(ArrayDecoder::<Decimal128Array>::new(array)?),
+        Decimal256(_, _) => Box::new(ArrayDecoder::<Decimal256Array>::new(array)?),
+        data_type => return Err(anyhow!("'{data_type}' is not a supported decimal type")),
+    };
+
+    Ok(array_decoder)
+}
+
+fn is_binary(data_type: &DataType) -> bool {
+    use DataType::*;
+
+    matches! {
+        data_type,
+        Binary | BinaryView | FixedSizeBinary(_) | LargeBinary
+    }
+}
+
+fn binary_decoder<'a, T>(array: &'a dyn Array) -> Result<Box<dyn Decoder<Output = T> + 'a>>
+where
+    T: 'static,
+    ArrayDecoder<'a, BinaryArray>: Decoder<Output = T>,
+    ArrayDecoder<'a, BinaryViewArray>: Decoder<Output = T>,
+    ArrayDecoder<'a, FixedSizeBinaryArray>: Decoder<Output = T>,
+    ArrayDecoder<'a, LargeBinaryArray>: Decoder<Output = T>,
+{
+    use DataType::*;
+
+    let array_decoder: Box<dyn Decoder<Output = T> + 'a> = match array.data_type() {
+        Binary => Box::new(ArrayDecoder::<BinaryArray>::new(array)?),
+        BinaryView => Box::new(ArrayDecoder::<BinaryViewArray>::new(array)?),
+        FixedSizeBinary(_) => Box::new(ArrayDecoder::<FixedSizeBinaryArray>::new(array)?),
+        LargeBinary => Box::new(ArrayDecoder::<LargeBinaryArray>::new(array)?),
+        data_type => return Err(anyhow!("'{data_type}' is not a supported binary type")),
+    };
+
+    Ok(array_decoder)
+}
+
+fn is_string(data_type: &DataType) -> bool {
+    use DataType::*;
+
+    matches!
{ + data_type, + Utf8 | Utf8View | LargeUtf8 + } +} + +fn string_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, StringArray>: Decoder, + ArrayDecoder<'a, StringViewArray>: Decoder, + ArrayDecoder<'a, LargeStringArray>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Utf8 => Box::new(ArrayDecoder::::new(array)?), + Utf8View => Box::new(ArrayDecoder::::new(array)?), + LargeUtf8 => Box::new(ArrayDecoder::::new(array)?), + data_type => return Err(anyhow!("'{data_type}' is not a supported string type")), + }; + + Ok(array_decoder) +} + +fn is_timestamp(data_type: &DataType) -> bool { + use DataType::*; + + matches! { + data_type, + Timestamp(TimeUnit::Second, _) | + Timestamp(TimeUnit::Millisecond, _) | + Timestamp(TimeUnit::Microsecond, _) | + Timestamp(TimeUnit::Nanosecond, _) + } +} + +fn timestamp_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, TimestampSecondArray>: Decoder, + ArrayDecoder<'a, TimestampMillisecondArray>: Decoder, + ArrayDecoder<'a, TimestampMicrosecondArray>: Decoder, + ArrayDecoder<'a, TimestampNanosecondArray>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Timestamp(TimeUnit::Second, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Millisecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Microsecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Nanosecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + data_type => return Err(anyhow!("'{data_type}' is not a supported timestamp type")), + }; + + Ok(array_decoder) +} diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs index 9a8f5817c0e..49aa4707951 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/nozzle/mod.rs @@ -1,9 +1,13 @@ //! This module contains the functionality required to support Nozzle Subgraphs. 
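// Aside (illustrative sketch, not part of this patch): the decoders defined in
// `value_decoder.rs` above ultimately perform per-row conversions along the lines
// of the function below, where each slot of an Arrow column becomes a store
// `Value` and null slots map to `Value::Null`. It assumes the `Value::Int` /
// `Value::Null` variants used by the mapping closures above.
fn decode_int32_column(array: &arrow::array::Int32Array) -> Vec<crate::data::store::Value> {
    use crate::data::store::Value;

    (0..array.len())
        .map(|row| {
            if array.is_null(row) {
                Value::Null
            } else {
                Value::Int(array.value(row))
            }
        })
        .collect()
}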
pub mod client; +pub mod codec; pub mod common; pub mod error; pub mod log; pub mod stream_aggregator; -pub use self::client::{flight_client::FlightClient, Client}; +pub use self::{ + client::{flight_client::FlightClient, Client}, + codec::Codec, +}; From 0dd263beac7a4ffa3b68fd3ac429b76d280d93ee Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 04/40] feat(graph): add SQL query parser, resolver and validator --- Cargo.lock | 16 +- Cargo.toml | 1 + graph/Cargo.toml | 1 + graph/src/nozzle/mod.rs | 1 + graph/src/nozzle/sql/mod.rs | 3 + graph/src/nozzle/sql/query/filter_blocks.rs | 174 ++++++++++++++++ graph/src/nozzle/sql/query/mod.rs | 186 ++++++++++++++++++ .../sql/query/resolve_event_signatures.rs | 106 ++++++++++ .../sql/query/resolve_source_address.rs | 81 ++++++++ graph/src/nozzle/sql/query/validate_tables.rs | 88 +++++++++ 10 files changed, 655 insertions(+), 2 deletions(-) create mode 100644 graph/src/nozzle/sql/mod.rs create mode 100644 graph/src/nozzle/sql/query/filter_blocks.rs create mode 100644 graph/src/nozzle/sql/query/mod.rs create mode 100644 graph/src/nozzle/sql/query/resolve_event_signatures.rs create mode 100644 graph/src/nozzle/sql/query/resolve_source_address.rs create mode 100644 graph/src/nozzle/sql/query/validate_tables.rs diff --git a/Cargo.lock b/Cargo.lock index 06b1c636052..979fccde810 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2774,7 +2774,8 @@ dependencies = [ "slog-async", "slog-envlogger", "slog-term", - "sqlparser", + "sqlparser 0.57.0", + "sqlparser 0.59.0", "stable-hash 0.3.4", "stable-hash 0.4.4", "strum_macros 0.27.2", @@ -3043,7 +3044,7 @@ dependencies = [ "rand 0.9.2", "serde", "serde_json", - "sqlparser", + "sqlparser 0.59.0", "stable-hash 0.3.4", "thiserror 2.0.16", ] @@ -6345,6 +6346,17 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9b39299b249ad65f3b7e96443bad61c02ca5cd3589f46cb6d610a0fd6c0d6a" +[[package]] +name = "sqlparser" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07c5f081b292a3d19637f0b32a79e28ff14a9fd23ef47bd7fce08ff5de221eca" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + [[package]] name = "sqlparser" version = "0.59.0" diff --git a/Cargo.toml b/Cargo.toml index b5f2029580c..3c66469d523 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -105,6 +105,7 @@ arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } heck = "0.5.0" lazy-regex = "3.4.1" +sqlparser-latest = { version = "0.57.0", package = "sqlparser", features = ["visitor"] } # Incremental compilation on Rust 1.58 causes an ICE on build. As soon as graph node builds again, these can be removed. 
[profile.test] diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 30b68247d16..ac16e60d853 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -110,6 +110,7 @@ arrow-flight.workspace = true arrow.workspace = true heck.workspace = true lazy-regex.workspace = true +sqlparser-latest.workspace = true [dev-dependencies] clap.workspace = true diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs index 49aa4707951..d3fbe063721 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/nozzle/mod.rs @@ -5,6 +5,7 @@ pub mod codec; pub mod common; pub mod error; pub mod log; +pub mod sql; pub mod stream_aggregator; pub use self::{ diff --git a/graph/src/nozzle/sql/mod.rs b/graph/src/nozzle/sql/mod.rs new file mode 100644 index 00000000000..a8f43f9078c --- /dev/null +++ b/graph/src/nozzle/sql/mod.rs @@ -0,0 +1,3 @@ +pub mod query; + +pub use self::query::Query; diff --git a/graph/src/nozzle/sql/query/filter_blocks.rs b/graph/src/nozzle/sql/query/filter_blocks.rs new file mode 100644 index 00000000000..e36ccf4ee51 --- /dev/null +++ b/graph/src/nozzle/sql/query/filter_blocks.rs @@ -0,0 +1,174 @@ +use std::{collections::BTreeMap, ops::ControlFlow}; + +use alloy::primitives::BlockNumber; +use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; + +use super::{format::Ident, parse}; + +/// Applies a block range filter to the SQL query. +/// +/// Creates temporary ordered result sets for each table in the dataset, limiting +/// the blocks processed during execution. +/// +/// The temporary result sets replace the tables referenced in the SQL query. +/// +/// This ensures deterministic output during query execution and enables resuming +/// after failures or when new blocks are available. +pub(super) fn filter_blocks( + query: &mut ast::Query, + dataset: &Ident, + tables: &[Ident], + start_block: BlockNumber, + end_block: BlockNumber, +) { + let tables_to_cte_mapping = tables_to_cte_mapping(dataset, tables); + + let mut table_to_cte_replacer = TableToCteReplacer::new(&tables_to_cte_mapping); + let _: ControlFlow<()> = VisitMut::visit(query, &mut table_to_cte_replacer); + + match &mut query.with { + Some(with) => { + remove_cte_filters(&mut with.cte_tables, &tables_to_cte_mapping); + + add_cte_filters( + &mut with.cte_tables, + &tables_to_cte_mapping, + start_block, + end_block, + ); + } + None => { + let mut cte_tables = Vec::new(); + + add_cte_filters( + &mut cte_tables, + &tables_to_cte_mapping, + start_block, + end_block, + ); + + query.with = Some(ast::With { + with_token: ast::helpers::attached_token::AttachedToken::empty(), + recursive: false, + cte_tables, + }) + } + } +} + +// Maps `dataset` and `tables` to consistent names for temporary result sets. +fn tables_to_cte_mapping(dataset: &Ident, tables: &[Ident]) -> BTreeMap { + tables + .into_iter() + .map(|table| { + let dataset_table = format!("{dataset}.{table}"); + let cte_table = format!("sg_{dataset}_{table}"); + + (dataset_table, cte_table) + }) + .collect() +} + +/// Removes previously added temporary result sets from the SQL query. +fn remove_cte_filters(ctes: &mut Vec, tables_to_cte_mapping: &BTreeMap) { + ctes.retain(|cte| { + !tables_to_cte_mapping + .values() + .any(|cte_table| *cte_table == cte.alias.name.value) + }); +} + +/// Creates temporary result sets for each table in the dataset and adds them to the SQL query. 
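// Aside (illustrative worked example, not part of this patch; the dataset, table
// and block numbers are made up): with dataset `eth`, table `logs` and the block
// range 10..=20, the rewrite described above turns
//
//     SELECT * FROM eth.logs
//
// into, roughly,
//
//     WITH sg_eth_logs AS (
//         SELECT * FROM eth.logs
//         WHERE _block_num BETWEEN 10 AND 20
//         ORDER BY _block_num ASC
//     )
//     SELECT * FROM sg_eth_logs AS logs
//
// so the original table reference now reads from a bounded, ordered result set.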
+fn add_cte_filters( + ctes: &mut Vec, + tables_to_cte_mapping: &BTreeMap, + start_block: BlockNumber, + end_block: BlockNumber, +) { + let mut output_ctes = Vec::with_capacity(ctes.len() + tables_to_cte_mapping.len()); + + for (table, cte_table) in tables_to_cte_mapping { + let query = parse::query(format!( + "SELECT * FROM {table} WHERE _block_num BETWEEN {start_block} AND {end_block} ORDER BY _block_num ASC" + )) + .unwrap(); + + output_ctes.push(ast::Cte { + alias: ast::TableAlias { + name: ast::Ident::new(cte_table), + columns: Vec::new(), + }, + query: Box::new(query), + from: None, + materialized: None, + closing_paren_token: ast::helpers::attached_token::AttachedToken::empty(), + }); + } + + output_ctes.append(ctes); + let _empty = std::mem::replace(ctes, output_ctes); +} + +/// Walks the SQL AST and replaces each table reference with a temporary result set name. +struct TableToCteReplacer<'a> { + tables_to_cte_mapping: &'a BTreeMap, +} + +impl<'a> TableToCteReplacer<'a> { + /// Creates a new replacer. + fn new(tables_to_cte_mapping: &'a BTreeMap) -> Self { + Self { + tables_to_cte_mapping, + } + } + + /// Makes the `table_factor` reference a temporary result set instead of a table. + /// + /// Ignores unrelated table factors and table references without a namespace because + /// they might reference other CTEs. + fn visit_table_factor(&self, table_factor: &mut ast::TableFactor) { + let ast::TableFactor::Table { name, alias, .. } = table_factor else { + return; + }; + + let mut iter = name.0.iter().rev().map(|part| match part { + ast::ObjectNamePart::Identifier(ident) => ident.value.as_str(), + }); + + let Some(table) = iter.next() else { + return; + }; + + let Some(dataset) = iter.next() else { + return; + }; + + let dataset_table = format!("{}.{}", Ident::new(dataset), Ident::new(table)); + let Some(cte_table) = self.tables_to_cte_mapping.get(&dataset_table) else { + return; + }; + + if alias.is_none() { + *alias = Some(ast::TableAlias { + name: ast::Ident::new(table), + columns: Vec::new(), + }) + } + + *name = ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + cte_table, + ))]); + } +} + +impl<'a> VisitorMut for TableToCteReplacer<'a> { + type Break = (); + + fn pre_visit_table_factor( + &mut self, + table_factor: &mut ast::TableFactor, + ) -> ControlFlow { + self.visit_table_factor(table_factor); + ControlFlow::Continue(()) + } +} diff --git a/graph/src/nozzle/sql/query/mod.rs b/graph/src/nozzle/sql/query/mod.rs new file mode 100644 index 00000000000..f13c506afa2 --- /dev/null +++ b/graph/src/nozzle/sql/query/mod.rs @@ -0,0 +1,186 @@ +mod filter_blocks; +mod resolve_event_signatures; +mod resolve_source_address; +mod validate_tables; + +use std::fmt; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use anyhow::{bail, Context, Result}; +use itertools::Itertools; +use sqlparser_latest::ast; + +/// Represents a valid SQL query of a Nozzle Subgraph. +/// +/// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. +/// The data returned by executing this query is used to create Subgraph entities. +pub struct Query { + /// The raw SQL AST that represents the SQL query. + ast: ast::Query, + + /// The dataset that the SQL query requests data from. + dataset: format::Ident, + + /// The tables that the SQL query requests data from. + tables: Vec, +} + +/// Contains the ABI information that is used to resolve event signatures in SQL queries. +pub struct Abi<'a> { + /// The name of the contract. 
+ pub name: &'a str, + + /// The JSON ABI of the contract. + pub contract: &'a JsonAbi, +} + +impl Query { + /// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. + /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query cannot be parsed + /// - The SQL query is not valid + /// - The SQL query cannot be resolved + /// + /// The returned error is deterministic. + pub fn new<'a>( + sql: impl AsRef, + dataset: impl AsRef, + tables: impl IntoIterator>, + source_address: &Address, + abis: impl IntoIterator>, + ) -> Result { + let mut query = parse::query(sql).context("failed to parse SQL query")?; + let dataset = format::Ident::new(dataset); + let tables = tables.into_iter().map(format::Ident::new).collect_vec(); + let abis = abis.into_iter().collect_vec(); + + Self::validate(&query, &dataset, &tables).context("failed to validate SQL query")?; + Self::resolve(&mut query, source_address, &abis).context("failed to resolve SQL query")?; + + Ok(Self { + ast: query, + dataset, + tables, + }) + } + + /// Applies a block range filter to this SQL query. + /// + /// Creates temporary ordered result sets for each table in the dataset, limiting + /// the blocks processed during execution. + /// + /// The temporary result sets replace the tables referenced in this SQL query. + /// + /// This ensures deterministic output during query execution and enables resuming + /// after failures or when new blocks are available. + pub fn filter_blocks(&mut self, start_block: BlockNumber, end_block: BlockNumber) { + filter_blocks::filter_blocks( + &mut self.ast, + &self.dataset, + &self.tables, + start_block, + end_block, + ); + } + + /// Validates the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query references unknown tables and datasets + /// - The SQL query uses custom `SETTINGS` + /// + /// The returned error is deterministic. + fn validate( + query: &ast::Query, + dataset: &format::Ident, + tables: &[format::Ident], + ) -> Result<()> { + validate_tables::validate_tables(query, dataset, tables)?; + + if query.settings.is_some() { + bail!("custom SETTINGS are not allowed"); + } + + Ok(()) + } + + /// Resolves Subgraph-specific function calls in the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - Source address function calls cannot be resolved + /// - Event signature function calls cannot be resolved + /// + /// The returned error is deterministic. + fn resolve(query: &mut ast::Query, source_address: &Address, abis: &[Abi<'_>]) -> Result<()> { + resolve_source_address::resolve_source_address(query, source_address)?; + resolve_event_signatures::resolve_event_signatures(query, abis)?; + + Ok(()) + } +} + +impl fmt::Display for Query { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.ast) + } +} + +mod format { + use std::fmt; + + /// Represents a normalized SQL identifier. + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub(super) struct Ident(Box); + + impl Ident { + /// Creates a normalized SQL identifier. + pub(super) fn new(s: impl AsRef) -> Self { + Self(s.as_ref().to_lowercase().into()) + } + } + + impl fmt::Display for Ident { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } + } +} + +mod parse { + use anyhow::{anyhow, bail, Context, Result}; + use itertools::Itertools; + use sqlparser_latest::{ast, dialect::GenericDialect, parser::Parser}; + + /// Parses a SQL query and returns its AST. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query cannot be parsed + /// - The SQL query has multiple SQL statements + /// - The SQL query is not a `SELECT` query + pub(super) fn query(s: impl AsRef) -> Result { + let statement = Parser::parse_sql(&GenericDialect {}, s.as_ref()) + .context("invalid SQL query")? + .into_iter() + .exactly_one() + .map_err(|e| anyhow!("expected exactly one SQL statement, found {}", e.count()))?; + + let query = match statement { + ast::Statement::Query(query) => *query, + _ => bail!("invalid SQL query: only SELECT statements are allowed"), + }; + + Ok(query) + } +} diff --git a/graph/src/nozzle/sql/query/resolve_event_signatures.rs b/graph/src/nozzle/sql/query/resolve_event_signatures.rs new file mode 100644 index 00000000000..36b6a47f094 --- /dev/null +++ b/graph/src/nozzle/sql/query/resolve_event_signatures.rs @@ -0,0 +1,106 @@ +use std::ops::ControlFlow; + +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +use super::Abi; + +static FUNCTION_NAME: &str = "sg_event_signature"; + +/// Replaces `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` function calls with +/// the correct event signature based on `abis`. +/// +/// # Errors +/// +/// Returns an error if: +/// - The function is called with incorrect arguments +/// - The contract name is not found in `abis` +/// - The event name is not found in `abis` +/// +/// The returned error is deterministic. +pub(super) fn resolve_event_signatures(query: &mut ast::Query, abis: &[Abi<'_>]) -> Result<()> { + let visit_result = visit_expressions_mut(query, |expr| match visit_expr(expr, abis) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, abis: &[Abi<'_>]) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + let mut ident_iter = function.name.0.iter().rev(); + let Some(ast::ObjectNamePart::Identifier(ident)) = ident_iter.next() else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&ident.value) { + return Ok(()); + } + + if ident_iter.next().is_some() { + return Ok(()); + } + + let Some((contract_name, event_name)) = get_args(function) else { + bail!("invalid function call: expected `{FUNCTION_NAME}('CONTRACT_NAME', 'EVENT_NAME')`, found: `{function}`"); + }; + + let Some(event) = get_event(abis, contract_name, event_name) else { + bail!("invalid function call: unknown contract '{contract_name}' or event '{event_name}'"); + }; + + let signature = ast::Value::SingleQuotedString(event.full_signature()).with_empty_span(); + *expr = ast::Expr::Value(signature); + + Ok(()) +} + +fn get_args<'a>(function: &'a ast::Function) -> Option<(&'a str, &'a str)> { + let ast::FunctionArguments::List(args) = &function.args else { + return None; + }; + + if args.args.len() != 2 { + return None; + } + + match (get_arg(&args.args[0]), get_arg(&args.args[1])) { + (Some(contract_name), Some(event_name)) => Some((contract_name, event_name)), + _ => None, + } +} + +fn get_arg<'a>(arg: &'a ast::FunctionArg) -> Option<&'a str> { + let ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr)) = arg else { + return None; + }; + + match expr { + ast::Expr::Value(ast::ValueWithSpan { + value: ast::Value::SingleQuotedString(value), + .. 
+ }) if !value.is_empty() => Some(value), + _ => None, + } +} + +fn get_event<'a>( + abis: &'a [Abi<'_>], + contract_name: &str, + event_name: &str, +) -> Option<&'a alloy::json_abi::Event> { + abis.iter() + .find(|abi| abi.name == contract_name) + .map(|abi| abi.contract.event(event_name)) + .flatten() + .map(|events| events.first()) + .flatten() +} diff --git a/graph/src/nozzle/sql/query/resolve_source_address.rs b/graph/src/nozzle/sql/query/resolve_source_address.rs new file mode 100644 index 00000000000..8c0d7faaccf --- /dev/null +++ b/graph/src/nozzle/sql/query/resolve_source_address.rs @@ -0,0 +1,81 @@ +use std::ops::ControlFlow; + +use alloy::primitives::Address; +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +static FUNCTION_NAME: &str = "sg_source_address"; + +/// Replaces `sg_source_address()` function calls in the SQL query with the `source_address`. +/// +/// # Errors +/// +/// Returns an error if the function is called with any arguments. +/// +/// The returned error is deterministic. +pub(super) fn resolve_source_address( + query: &mut ast::Query, + source_address: &Address, +) -> Result<()> { + let visit_result = + visit_expressions_mut(query, |expr| match visit_expr(expr, source_address) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, source_address: &Address) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + let mut ident_iter = function.name.0.iter().rev(); + let Some(ast::ObjectNamePart::Identifier(ident)) = ident_iter.next() else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&ident.value) { + return Ok(()); + } + + if ident_iter.next().is_some() { + return Ok(()); + } + + if !matches!(function.args, ast::FunctionArguments::None) { + bail!("invalid function call: function '{FUNCTION_NAME}' does not accept arguments"); + } + + *function = ast::Function { + name: ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + "arrow_cast", + ))]), + uses_odbc_syntax: false, + parameters: ast::FunctionArguments::None, + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args: vec![ + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::HexStringLiteral(hex::encode(source_address)).with_empty_span(), + ))), + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::SingleQuotedString("FixedSizeBinary(20)".to_string()) + .with_empty_span(), + ))), + ], + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + }; + + Ok(()) +} diff --git a/graph/src/nozzle/sql/query/validate_tables.rs b/graph/src/nozzle/sql/query/validate_tables.rs new file mode 100644 index 00000000000..547328f637d --- /dev/null +++ b/graph/src/nozzle/sql/query/validate_tables.rs @@ -0,0 +1,88 @@ +use std::ops::ControlFlow; + +use anyhow::{anyhow, bail, Error, Result}; +use sqlparser_latest::ast::{self, Visit, Visitor}; + +use super::format::Ident; + +/// Validates the dataset and tables used by the SQL query to ensure consistency with the explicitly declared ones. +/// +/// Checks every table reference in the SQL query and verifies that they match the `dataset` and `tables`. 
+/// Ignores table references not in `namespace.table` format as they may reference CTEs. +/// +/// # Errors +/// +/// Returns an error if: +/// - The SQL query references a dataset that is not equal to `dataset` +/// - The SQL query references a table that is not in the `tables` list +/// +/// The returned error is deterministic. +pub(super) fn validate_tables(query: &ast::Query, dataset: &Ident, tables: &[Ident]) -> Result<()> { + let mut table_validator = TableValidator { dataset, tables }; + if let ControlFlow::Break(e) = Visit::visit(query, &mut table_validator) { + return Err(e); + } + Ok(()) +} + +/// Walks the SQL AST and validates every table reference. +struct TableValidator<'a> { + dataset: &'a Ident, + tables: &'a [Ident], +} + +impl<'a> TableValidator<'a> { + /// Validates that the `table_factor` references the explicitly declared dataset and tables. + /// + /// Ignores unrelated table factors and table references without a namespace as they may reference CTEs. + /// + /// # Errors + /// + /// Returns an error if: + /// - The `table_factor` references a dataset that is not equal to `dataset` + /// - The `table_factor` references a table that is not in the `tables` list + /// + /// The returned error is deterministic. + fn visit_table_factor(&self, table_factor: &ast::TableFactor) -> Result<()> { + let ast::TableFactor::Table { name, .. } = table_factor else { + return Ok(()); + }; + + let mut ident_iter = name.0.iter().rev().map(|part| match part { + ast::ObjectNamePart::Identifier(ident) => Ident::new(ident.value.as_str()), + }); + + let Some(table) = ident_iter.next() else { + return Ok(()); + }; + + let Some(dataset) = ident_iter.next() else { + return Ok(()); + }; + + if *self.dataset != dataset { + bail!("'{name}': invalid dataset '{dataset}'"); + } + + if !self.tables.iter().any(|t| *t == table) { + bail!("'{name}': invalid table '{table}'"); + } + + Ok(()) + } +} + +impl<'a> Visitor for TableValidator<'a> { + type Break = Error; + + fn post_visit_table_factor( + &mut self, + table_factor: &ast::TableFactor, + ) -> ControlFlow { + if let Err(e) = self.visit_table_factor(table_factor) { + return ControlFlow::Break(anyhow!("failed to validate table {e:#}")); + } + + ControlFlow::Continue(()) + } +} From 6429c19b409b415673a43dbc51c05505b120fddd Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 05/40] feat(graph): use a new identifier type in Nozzle related modules --- graph/src/nozzle/codec/mod.rs | 21 +-- graph/src/nozzle/codec/name_cache.rs | 71 ++----- graph/src/nozzle/common/ident.rs | 174 ++++++++++++++++++ graph/src/nozzle/common/mod.rs | 4 + graph/src/nozzle/sql/query/filter_blocks.rs | 42 +++-- graph/src/nozzle/sql/query/mod.rs | 47 ++--- .../sql/query/resolve_event_signatures.rs | 2 +- graph/src/nozzle/sql/query/validate_tables.rs | 5 +- 8 files changed, 242 insertions(+), 124 deletions(-) create mode 100644 graph/src/nozzle/common/ident.rs diff --git a/graph/src/nozzle/codec/mod.rs b/graph/src/nozzle/codec/mod.rs index 3cb55e71227..185a0978c92 100644 --- a/graph/src/nozzle/codec/mod.rs +++ b/graph/src/nozzle/codec/mod.rs @@ -11,11 +11,8 @@ use anyhow::{anyhow, bail, Context, Result}; use arrow::array::{Array, RecordBatch}; use self::{ - array_decoder::ArrayDecoder, - decoder::Decoder, - list_decoder::ListDecoder, - mapping_decoder::MappingDecoder, - name_cache::{NameCache, NormalizedName}, + array_decoder::ArrayDecoder, decoder::Decoder, list_decoder::ListDecoder, + 
mapping_decoder::MappingDecoder, name_cache::NameCache, }; use crate::{ data::{ @@ -23,6 +20,7 @@ use crate::{ store::{Id, IdType, Value}, value::Word, }, + nozzle::common::Ident, schema::{EntityKey, EntityType, Field, InputSchema}, }; @@ -164,8 +162,8 @@ impl Codec { .fields() .into_iter() .zip(record_batch.columns()) - .map(|(field, array)| (self.normalized_name(field.name()), array.as_ref())) - .collect::>(); + .map(|(field, array)| Ok((self.ident(field.name())?, array.as_ref()))) + .collect::>>()?; let mut value_decoders = BTreeMap::new(); for field in &object_type.fields { @@ -195,7 +193,7 @@ impl Codec { fn value_decoder<'a>( &mut self, field: &'a Field, - columns: &HashMap, + columns: &HashMap, ) -> Result + 'a>>> { // VIDs are auto-generated if field.name.eq_ignore_ascii_case("vid") { @@ -207,7 +205,7 @@ impl Codec { return Ok(None); } - let normalized_name = self.normalized_name(&field.name); + let normalized_name = self.ident(&field.name)?; let array = match columns.get(&normalized_name) { Some(&array) => array, None => { @@ -231,8 +229,7 @@ impl Codec { Ok(Some(decoder)) } - // Returns a normalized version of `name`. - fn normalized_name(&mut self, name: impl AsRef) -> NormalizedName { - self.name_cache.normalized(name.as_ref()) + fn ident(&mut self, name: impl AsRef) -> Result { + self.name_cache.ident(name.as_ref()) } } diff --git a/graph/src/nozzle/codec/name_cache.rs b/graph/src/nozzle/codec/name_cache.rs index 8a56a1558fc..07bf874fccc 100644 --- a/graph/src/nozzle/codec/name_cache.rs +++ b/graph/src/nozzle/codec/name_cache.rs @@ -1,23 +1,14 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; -use heck::ToSnakeCase; +use anyhow::Result; -use crate::{cheap_clone::CheapClone, derive::CheapClone}; +use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; -/// Provides case-insensitive string comparison through normalization. -/// -/// Normalizes names and stores them in memory for fast access. +/// Caches identifiers that are used to match Arrow columns and Subgraph entity fields. pub(super) struct NameCache { - cache: HashMap, NormalizedName>, + cache: HashMap, Ident>, } -/// Contains a normalized name. -/// -/// A normalized name is a list of lowercase tokens from the original name. -/// A token is a sequence of characters between case format separators. -#[derive(Debug, Clone, CheapClone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub(super) struct NormalizedName(Arc<[Box]>); - impl NameCache { /// Creates a new empty cache. pub(super) fn new() -> Self { @@ -26,52 +17,18 @@ impl NameCache { } } - /// Takes a `name` with any case format and returns a normalized version. - /// - /// A normalized version is a list of lowercase tokens from the original input. - /// A token is a sequence of characters between case format separators. - /// - /// # Example - /// - /// ```no_run - /// let mut name_cache = NameCache::new(); + /// Returns the identifier for the given name. 
/// - /// assert_eq!( - /// name_cache.normalized("blockNumber"), - /// vec!["block".into(), "number".into()].into(), - /// ); - /// assert_eq!( - /// name_cache.normalized("block number"), - /// vec!["block".into(), "number".into()].into(), - /// ); - /// assert_eq!( - /// name_cache.normalized("block_number"), - /// vec!["block".into(), "number".into()].into(), - /// ); - /// ``` - pub(super) fn normalized(&mut self, name: &str) -> NormalizedName { - if let Some(normalized_name) = self.cache.get(name) { - return normalized_name.cheap_clone(); + /// If the identifier exists in the cache, returns the cached version. + /// Otherwise, creates a new identifier, caches it, and returns it. + pub(super) fn ident(&mut self, name: &str) -> Result { + if let Some(ident) = self.cache.get(name) { + return Ok(ident.cheap_clone()); } - let normalized_name = NormalizedName::new(name); - - self.cache - .insert(name.into(), normalized_name.cheap_clone()); - - normalized_name - } -} + let ident = Ident::new(name)?; + self.cache.insert(name.into(), ident.cheap_clone()); -impl NormalizedName { - /// Creates a normalized name from the input string. - fn new(name: &str) -> Self { - Self( - name.to_snake_case() - .split('_') - .map(Into::into) - .collect::>() - .into(), - ) + Ok(ident) } } diff --git a/graph/src/nozzle/common/ident.rs b/graph/src/nozzle/common/ident.rs new file mode 100644 index 00000000000..328140c77af --- /dev/null +++ b/graph/src/nozzle/common/ident.rs @@ -0,0 +1,174 @@ +use std::{ + cmp::Ordering, + fmt, + hash::{Hash, Hasher}, + sync::Arc, +}; + +use anyhow::{bail, Result}; +use heck::{ToLowerCamelCase, ToSnakeCase, ToUpperCamelCase}; +use lazy_regex::regex_is_match; + +use crate::derive::CheapClone; + +/// Represents a valid identifier that can be used for SQL table names, SQL column names, +/// entity names and entity fields. +/// +/// Validates and tokenizes an identifier to allow case-insensitive and format-insensitive +/// comparison between multiple identifiers. +/// +/// Maintains the original identifier for cases when the exact format is required after comparisons. +/// +/// # Example +/// +/// ```rust +/// # use graph::nozzle::common::Ident; +/// +/// assert_eq!(Ident::new("block_hash").unwrap(), Ident::new("blockHash").unwrap()); +/// assert_eq!(Ident::new("block-hash").unwrap(), Ident::new("BlockHash").unwrap()); +/// ``` +#[derive(Debug, Clone, CheapClone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Ident(Arc); + +impl Ident { + /// Creates a new identifier. + /// + /// Validates and tokenizes an identifier to allow case-insensitive and format-insensitive + /// comparison between multiple identifiers. + /// + /// # Errors + /// + /// Returns an error if: + /// - The input string `s` does not start with a letter or an underscore + /// - The input string `s` does not contain only letters, numbers, hyphens, and underscores + /// - The input string `s` contains more than 100 characters + /// + /// The returned error is deterministic. + pub fn new(s: impl AsRef) -> Result { + let raw = s.as_ref(); + + if !regex_is_match!("^[a-zA-Z_][a-zA-Z0-9_-]{0,100}$", raw) { + bail!("invalid identifier '{raw}': must start with a letter or an underscore, and contain only letters, numbers, hyphens, and underscores"); + } + + Ok(Self(Arc::new(Inner::new(raw)))) + } + + /// Returns a reference to the original string used to create this identifier. 
+ /// + /// # Example + /// + /// ```rust + /// # use graph::nozzle::common::Ident; + /// + /// let ident = Ident::new("BLOCK_hash").unwrap(); + /// assert_eq!(ident.as_str(), "BLOCK_hash"); + /// ``` + pub fn as_str(&self) -> &str { + &self.0.raw + } + + /// Returns the tokens of this identifier that are used for case-insensitive and format-insensitive comparison. + /// + /// A token is a sequence of lowercase characters between case format separators. + /// + /// # Example + /// + /// ```rust + /// # use graph::nozzle::common::Ident; + /// + /// let ident = Ident::new("blockHash").unwrap(); + /// assert_eq!(ident.tokens(), &["block".into(), "hash".into()]); + /// ``` + pub fn tokens(&self) -> &[Box] { + &self.0.tokens + } + + /// Converts this identifier to `lowerCamelCase` format. + /// + /// # Example + /// + /// ```rust + /// # use graph::nozzle::common::Ident; + /// + /// let ident = Ident::new("block_hash").unwrap(); + /// assert_eq!(ident.to_lower_camel_case(), "blockHash"); + /// ``` + pub fn to_lower_camel_case(&self) -> String { + self.0.raw.to_lower_camel_case() + } + + /// Converts this identifier to `UpperCamelCase` format. + /// + /// # Example + /// + /// ```rust + /// # use graph::nozzle::common::Ident; + /// + /// let ident = Ident::new("block_hash").unwrap(); + /// assert_eq!(ident.to_upper_camel_case(), "BlockHash"); + /// ``` + pub fn to_upper_camel_case(&self) -> String { + self.0.raw.to_upper_camel_case() + } +} + +impl fmt::Display for Ident { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0.raw) + } +} + +/// The internal representation of an identifier. +#[derive(Debug)] +struct Inner { + /// The original unmodified string used to create the identifier. + raw: Box, + + /// The tokens of the identifier that are used for case-insensitive + /// and format-insensitive comparison. + tokens: Box<[Box]>, +} + +impl Inner { + /// Creates a new internal representation of an identifier. 
+ fn new(raw: &str) -> Self { + let tokens = raw + .to_snake_case() + .split('_') + .map(Into::into) + .collect::>() + .into(); + + Self { + raw: raw.into(), + tokens, + } + } +} + +impl PartialEq for Inner { + fn eq(&self, other: &Self) -> bool { + self.tokens == other.tokens + } +} + +impl Eq for Inner {} + +impl PartialOrd for Inner { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.tokens.cmp(&other.tokens)) + } +} + +impl Ord for Inner { + fn cmp(&self, other: &Self) -> Ordering { + self.tokens.cmp(&other.tokens) + } +} + +impl Hash for Inner { + fn hash(&self, state: &mut H) { + self.tokens.hash(state); + } +} diff --git a/graph/src/nozzle/common/mod.rs b/graph/src/nozzle/common/mod.rs index f4fadb33d08..7236bcdbdd4 100644 --- a/graph/src/nozzle/common/mod.rs +++ b/graph/src/nozzle/common/mod.rs @@ -1,3 +1,7 @@ +mod ident; + +pub use self::ident::Ident; + pub(super) mod column_aliases { pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &["_block_num", "block_num"]; pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &["hash", "block_hash"]; diff --git a/graph/src/nozzle/sql/query/filter_blocks.rs b/graph/src/nozzle/sql/query/filter_blocks.rs index e36ccf4ee51..78e29eaca58 100644 --- a/graph/src/nozzle/sql/query/filter_blocks.rs +++ b/graph/src/nozzle/sql/query/filter_blocks.rs @@ -3,7 +3,8 @@ use std::{collections::BTreeMap, ops::ControlFlow}; use alloy::primitives::BlockNumber; use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; -use super::{format::Ident, parse}; +use super::parse; +use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; /// Applies a block range filter to the SQL query. /// @@ -23,7 +24,7 @@ pub(super) fn filter_blocks( ) { let tables_to_cte_mapping = tables_to_cte_mapping(dataset, tables); - let mut table_to_cte_replacer = TableToCteReplacer::new(&tables_to_cte_mapping); + let mut table_to_cte_replacer = TableToCteReplacer::new(dataset, &tables_to_cte_mapping); let _: ControlFlow<()> = VisitMut::visit(query, &mut table_to_cte_replacer); match &mut query.with { @@ -32,6 +33,7 @@ pub(super) fn filter_blocks( add_cte_filters( &mut with.cte_tables, + dataset, &tables_to_cte_mapping, start_block, end_block, @@ -42,6 +44,7 @@ pub(super) fn filter_blocks( add_cte_filters( &mut cte_tables, + dataset, &tables_to_cte_mapping, start_block, end_block, @@ -57,20 +60,15 @@ pub(super) fn filter_blocks( } // Maps `dataset` and `tables` to consistent names for temporary result sets. -fn tables_to_cte_mapping(dataset: &Ident, tables: &[Ident]) -> BTreeMap { +fn tables_to_cte_mapping(dataset: &Ident, tables: &[Ident]) -> BTreeMap { tables .into_iter() - .map(|table| { - let dataset_table = format!("{dataset}.{table}"); - let cte_table = format!("sg_{dataset}_{table}"); - - (dataset_table, cte_table) - }) + .map(|table| (table.cheap_clone(), format!("sg_{dataset}_{table}"))) .collect() } /// Removes previously added temporary result sets from the SQL query. -fn remove_cte_filters(ctes: &mut Vec, tables_to_cte_mapping: &BTreeMap) { +fn remove_cte_filters(ctes: &mut Vec, tables_to_cte_mapping: &BTreeMap) { ctes.retain(|cte| { !tables_to_cte_mapping .values() @@ -81,7 +79,8 @@ fn remove_cte_filters(ctes: &mut Vec, tables_to_cte_mapping: &BTreeMap /// Creates temporary result sets for each table in the dataset and adds them to the SQL query. 
fn add_cte_filters( ctes: &mut Vec, - tables_to_cte_mapping: &BTreeMap, + dataset: &Ident, + tables_to_cte_mapping: &BTreeMap, start_block: BlockNumber, end_block: BlockNumber, ) { @@ -89,7 +88,7 @@ fn add_cte_filters( for (table, cte_table) in tables_to_cte_mapping { let query = parse::query(format!( - "SELECT * FROM {table} WHERE _block_num BETWEEN {start_block} AND {end_block} ORDER BY _block_num ASC" + "SELECT * FROM {dataset}.{table} WHERE _block_num BETWEEN {start_block} AND {end_block} ORDER BY _block_num ASC" )) .unwrap(); @@ -111,13 +110,15 @@ fn add_cte_filters( /// Walks the SQL AST and replaces each table reference with a temporary result set name. struct TableToCteReplacer<'a> { - tables_to_cte_mapping: &'a BTreeMap, + dataset: &'a Ident, + tables_to_cte_mapping: &'a BTreeMap, } impl<'a> TableToCteReplacer<'a> { /// Creates a new replacer. - fn new(tables_to_cte_mapping: &'a BTreeMap) -> Self { + fn new(dataset: &'a Ident, tables_to_cte_mapping: &'a BTreeMap) -> Self { Self { + dataset, tables_to_cte_mapping, } } @@ -143,14 +144,21 @@ impl<'a> TableToCteReplacer<'a> { return; }; - let dataset_table = format!("{}.{}", Ident::new(dataset), Ident::new(table)); - let Some(cte_table) = self.tables_to_cte_mapping.get(&dataset_table) else { + let (Ok(dataset), Ok(table)) = (Ident::new(dataset), Ident::new(table)) else { + return; + }; + + if *self.dataset != dataset { + return; + } + + let Some(cte_table) = self.tables_to_cte_mapping.get(&table) else { return; }; if alias.is_none() { *alias = Some(ast::TableAlias { - name: ast::Ident::new(table), + name: ast::Ident::new(table.as_str()), columns: Vec::new(), }) } diff --git a/graph/src/nozzle/sql/query/mod.rs b/graph/src/nozzle/sql/query/mod.rs index f13c506afa2..8f9a928609b 100644 --- a/graph/src/nozzle/sql/query/mod.rs +++ b/graph/src/nozzle/sql/query/mod.rs @@ -13,6 +13,8 @@ use anyhow::{bail, Context, Result}; use itertools::Itertools; use sqlparser_latest::ast; +use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; + /// Represents a valid SQL query of a Nozzle Subgraph. /// /// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. @@ -22,16 +24,16 @@ pub struct Query { ast: ast::Query, /// The dataset that the SQL query requests data from. - dataset: format::Ident, + dataset: Ident, /// The tables that the SQL query requests data from. - tables: Vec, + tables: Vec, } /// Contains the ABI information that is used to resolve event signatures in SQL queries. pub struct Abi<'a> { /// The name of the contract. - pub name: &'a str, + pub name: &'a Ident, /// The JSON ABI of the contract. pub contract: &'a JsonAbi, @@ -50,23 +52,21 @@ impl Query { /// The returned error is deterministic. 
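// Aside (illustrative usage sketch, not part of this patch; the SQL text,
// dataset, table and address are made up): a caller builds the declared
// identifiers first and then lets `Query::new` parse, validate and resolve
// the query.
//
//     let dataset = Ident::new("eth")?;
//     let tables = vec![Ident::new("logs")?];
//     let query = Query::new(
//         "SELECT block_num, address FROM eth.logs WHERE address = sg_source_address()",
//         &dataset,
//         &tables,
//         &Address::ZERO,
//         Vec::new(), // no ABIs needed unless sg_event_signature(...) is used
//     )?;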
pub fn new<'a>( sql: impl AsRef, - dataset: impl AsRef, - tables: impl IntoIterator>, + dataset: &Ident, + tables: &[Ident], source_address: &Address, abis: impl IntoIterator>, ) -> Result { let mut query = parse::query(sql).context("failed to parse SQL query")?; - let dataset = format::Ident::new(dataset); - let tables = tables.into_iter().map(format::Ident::new).collect_vec(); let abis = abis.into_iter().collect_vec(); - Self::validate(&query, &dataset, &tables).context("failed to validate SQL query")?; + Self::validate(&query, dataset, tables).context("failed to validate SQL query")?; Self::resolve(&mut query, source_address, &abis).context("failed to resolve SQL query")?; Ok(Self { ast: query, - dataset, - tables, + dataset: dataset.cheap_clone(), + tables: tables.to_vec(), }) } @@ -98,11 +98,7 @@ impl Query { /// - The SQL query uses custom `SETTINGS` /// /// The returned error is deterministic. - fn validate( - query: &ast::Query, - dataset: &format::Ident, - tables: &[format::Ident], - ) -> Result<()> { + fn validate(query: &ast::Query, dataset: &Ident, tables: &[Ident]) -> Result<()> { validate_tables::validate_tables(query, dataset, tables)?; if query.settings.is_some() { @@ -135,27 +131,6 @@ impl fmt::Display for Query { } } -mod format { - use std::fmt; - - /// Represents a normalized SQL identifier. - #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub(super) struct Ident(Box); - - impl Ident { - /// Creates a normalized SQL identifier. - pub(super) fn new(s: impl AsRef) -> Self { - Self(s.as_ref().to_lowercase().into()) - } - } - - impl fmt::Display for Ident { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0) - } - } -} - mod parse { use anyhow::{anyhow, bail, Context, Result}; use itertools::Itertools; diff --git a/graph/src/nozzle/sql/query/resolve_event_signatures.rs b/graph/src/nozzle/sql/query/resolve_event_signatures.rs index 36b6a47f094..a243cd12268 100644 --- a/graph/src/nozzle/sql/query/resolve_event_signatures.rs +++ b/graph/src/nozzle/sql/query/resolve_event_signatures.rs @@ -98,7 +98,7 @@ fn get_event<'a>( event_name: &str, ) -> Option<&'a alloy::json_abi::Event> { abis.iter() - .find(|abi| abi.name == contract_name) + .find(|abi| abi.name.as_str() == contract_name) .map(|abi| abi.contract.event(event_name)) .flatten() .map(|events| events.first()) diff --git a/graph/src/nozzle/sql/query/validate_tables.rs b/graph/src/nozzle/sql/query/validate_tables.rs index 547328f637d..429d60a4863 100644 --- a/graph/src/nozzle/sql/query/validate_tables.rs +++ b/graph/src/nozzle/sql/query/validate_tables.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use anyhow::{anyhow, bail, Error, Result}; use sqlparser_latest::ast::{self, Visit, Visitor}; -use super::format::Ident; +use crate::nozzle::common::Ident; /// Validates the dataset and tables used by the SQL query to ensure consistency with the explicitly declared ones. 
/// @@ -60,6 +60,9 @@ impl<'a> TableValidator<'a> { return Ok(()); }; + let table = table?; + let dataset = dataset?; + if *self.dataset != dataset { bail!("'{name}': invalid dataset '{dataset}'"); } From 0e8314baf423f1104149f31ca2b713601d61dd6b Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 06/40] feat(graph): add Nozzle Subgraph schema generation --- graph/src/nozzle/mod.rs | 1 + graph/src/nozzle/schema/generator/entity.rs | 167 ++++++++++++++++++++ graph/src/nozzle/schema/generator/mod.rs | 69 ++++++++ graph/src/nozzle/schema/mod.rs | 3 + 4 files changed, 240 insertions(+) create mode 100644 graph/src/nozzle/schema/generator/entity.rs create mode 100644 graph/src/nozzle/schema/generator/mod.rs create mode 100644 graph/src/nozzle/schema/mod.rs diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs index d3fbe063721..c3d579e3464 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/nozzle/mod.rs @@ -5,6 +5,7 @@ pub mod codec; pub mod common; pub mod error; pub mod log; +pub mod schema; pub mod sql; pub mod stream_aggregator; diff --git a/graph/src/nozzle/schema/generator/entity.rs b/graph/src/nozzle/schema/generator/entity.rs new file mode 100644 index 00000000000..d53249ce948 --- /dev/null +++ b/graph/src/nozzle/schema/generator/entity.rs @@ -0,0 +1,167 @@ +use std::fmt; + +use anyhow::{bail, Context, Result}; + +use crate::{cheap_clone::CheapClone, data::store::ValueType, nozzle::common::Ident}; + +/// A minimal representation of a Subgraph entity. +pub(super) struct Entity { + name: Ident, + fields: Vec, +} + +impl Entity { + /// Converts the Arrow schema to a Subgraph entity. + /// + /// # Errors + /// + /// Returns an error if Arrow fields cannot be converted to Subgraph entity fields. + /// + /// The returned error is deterministic. + pub(super) fn new(name: Ident, arrow_schema: arrow::datatypes::Schema) -> Result { + let mut fields = arrow_schema + .fields() + .iter() + .map(|field| { + Field::new(field) + .with_context(|| format!("failed to create field '{}'", field.name())) + }) + .collect::, _>>()?; + + if !fields.iter().any(|field| field.name.as_str() == "id") { + fields.push(Field::id()); + } + + fields.sort_unstable_by_key(|field| field.name.cheap_clone()); + + Ok(Self { name, fields }) + } +} + +impl fmt::Display for Entity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write! {f, "type {} @entity(immutable: true)", self.name.to_upper_camel_case()}?; + write! {f, " {{\n"}?; + for field in &self.fields { + write! {f, "\t{field}\n"}?; + } + write! {f, "}}"} + } +} + +/// A minimal representation of a Subgraph entity field. +struct Field { + name: Ident, + value_type: ValueType, + is_list: bool, + is_required: bool, +} + +impl Field { + /// Converts the Arrow field to a Subgraph entity field. + /// + /// # Errors + /// + /// Returns an error if: + /// - The Arrow field has an invalid name + /// - The Arrow field type cannot be converted to a Subgraph entity value type + /// + /// The returned error is deterministic. + fn new(arrow_field: &arrow::datatypes::Field) -> Result { + let name = Ident::new(arrow_field.name())?; + let (value_type, is_list) = arrow_data_type_to_value_type(arrow_field.data_type())?; + let is_required = !arrow_field.is_nullable(); + + Ok(Self { + name, + value_type, + is_list, + is_required, + }) + } + + /// Creates an `ID` Subgraph entity field. 
+ fn id() -> Self { + Self { + name: Ident::new("id").unwrap(), + value_type: ValueType::Bytes, + is_list: false, + is_required: true, + } + } +} + +impl fmt::Display for Field { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write! {f, "{}: ", self.name.to_lower_camel_case()}?; + if self.is_list { + write! {f, "["}?; + } + write! {f, "{}", self.value_type.to_str()}?; + if self.is_list { + write! {f, "]"}?; + } + if self.is_required { + write! {f, "!"}?; + } + Ok(()) + } +} + +fn arrow_data_type_to_value_type( + data_type: &arrow::datatypes::DataType, +) -> Result<(ValueType, bool)> { + use arrow::datatypes::DataType::*; + + let type_not_supported = || bail!("type '{data_type}' not supported"); + let value_type = match data_type { + Null => return type_not_supported(), + Boolean => ValueType::Boolean, + Int8 => ValueType::Int, + Int16 => ValueType::Int, + Int32 => ValueType::Int, + Int64 => ValueType::Int8, + UInt8 => ValueType::Int, + UInt16 => ValueType::Int, + UInt32 => ValueType::Int8, + UInt64 => ValueType::BigInt, + Float16 => ValueType::BigDecimal, + Float32 => ValueType::BigDecimal, + Float64 => ValueType::BigDecimal, + Timestamp(_, _) => ValueType::Timestamp, + Date32 => ValueType::Timestamp, + Date64 => ValueType::Timestamp, + Time32(_) => return type_not_supported(), + Time64(_) => return type_not_supported(), + Duration(_) => return type_not_supported(), + Interval(_) => return type_not_supported(), + Binary => ValueType::Bytes, + FixedSizeBinary(_) => ValueType::Bytes, + LargeBinary => ValueType::Bytes, + BinaryView => ValueType::Bytes, + Utf8 => ValueType::String, + LargeUtf8 => ValueType::String, + Utf8View => ValueType::String, + List(field) + | ListView(field) + | FixedSizeList(field, _) + | LargeList(field) + | LargeListView(field) => { + if field.data_type().is_nested() { + return type_not_supported(); + } + + return arrow_data_type_to_value_type(field.data_type()) + .map(|(value_type, _)| (value_type, true)); + } + Struct(_) => return type_not_supported(), + Union(_, _) => return type_not_supported(), + Dictionary(_, _) => return type_not_supported(), + Decimal128(_, _) => ValueType::BigDecimal, + Decimal256(_, _) => ValueType::BigDecimal, + Map(_, _) => return type_not_supported(), + RunEndEncoded(_, _) => return type_not_supported(), + }; + + Ok((value_type, false)) +} diff --git a/graph/src/nozzle/schema/generator/mod.rs b/graph/src/nozzle/schema/generator/mod.rs new file mode 100644 index 00000000000..fcdef01d970 --- /dev/null +++ b/graph/src/nozzle/schema/generator/mod.rs @@ -0,0 +1,69 @@ +mod entity; + +use anyhow::{Context, Result}; +use arrow::datatypes::Schema; +use itertools::Itertools; + +use self::entity::Entity; +use crate::{ + cheap_clone::CheapClone, data::subgraph::DeploymentHash, nozzle::common::Ident, + schema::InputSchema, +}; + +/// Generates a Subgraph schema from a list of Arrow schemas. +/// +/// # Limitations +/// +/// The generated Subgraph entities are immutable and do not contain any relationships to other entities within the schema. +/// +/// # Errors +/// +/// Returns an error if any of the Arrow schemas cannot be represented as valid Subgraph entities. +/// +/// The returned error is deterministic. 
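// Aside (illustrative usage sketch, not part of this patch; the entity name and
// columns are made up, and `deployment_hash` is assumed to already be in scope
// as a `&DeploymentHash`): each SQL query contributes an (entity name, Arrow
// schema) pair, and schemas that share a name are merged before the GraphQL
// types are emitted.
//
//     use arrow::datatypes::{DataType, Field, Schema};
//
//     let transfers = Schema::new(vec![
//         Field::new("id", DataType::Binary, false),
//         Field::new("amount", DataType::UInt64, false),
//     ]);
//
//     let input_schema = generate_subgraph_schema(
//         deployment_hash,
//         [(Ident::new("transfers")?, transfers)],
//     )?;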
+pub fn generate_subgraph_schema( + deployment_hash: &DeploymentHash, + queries: impl IntoIterator, +) -> Result { + let mut queries = merge_related_queries(queries)?; + queries.sort_unstable_by_key(|(name, _)| name.cheap_clone()); + + let entities = create_entities(queries)?; + let mut subgraph_schema = String::new(); + + for entity in entities { + subgraph_schema.extend(std::iter::once(entity.to_string())); + subgraph_schema.push_str("\n\n"); + } + + let input_schema = InputSchema::parse_latest(&subgraph_schema, deployment_hash.to_owned()) + .context("failed to parse subgraph schema")?; + + Ok(input_schema) +} + +fn merge_related_queries( + queries: impl IntoIterator, +) -> Result> { + queries + .into_iter() + .into_group_map_by(|(name, _)| name.cheap_clone()) + .into_iter() + .map(|(name, related_queries)| { + let related_schemas = related_queries.into_iter().map(|(_, schema)| schema); + + Schema::try_merge(related_schemas).map(|schema| (name, schema)) + }) + .collect::, _>>() + .context("failed to merge schemas of related SQL queries") +} + +fn create_entities(queries: Vec<(Ident, Schema)>) -> Result> { + queries + .into_iter() + .map(|(name, schema)| { + Entity::new(name.cheap_clone(), schema) + .with_context(|| format!("failed to create entity '{}'", name)) + }) + .collect::, _>>() +} diff --git a/graph/src/nozzle/schema/mod.rs b/graph/src/nozzle/schema/mod.rs new file mode 100644 index 00000000000..546777a14ff --- /dev/null +++ b/graph/src/nozzle/schema/mod.rs @@ -0,0 +1,3 @@ +mod generator; + +pub use self::generator::generate_subgraph_schema; From 01d2596740f8e0420f238c0fba7f0c59bbd0e1ee Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:58:07 +0300 Subject: [PATCH 07/40] feat(graph): add Nozzle Subgraph manifest --- graph/src/nozzle/client/mod.rs | 2 +- graph/src/nozzle/common/mod.rs | 1 + graph/src/nozzle/manifest/data_source/mod.rs | 102 ++++ graph/src/nozzle/manifest/data_source/raw.rs | 538 ++++++++++++++++++ graph/src/nozzle/manifest/mod.rs | 21 + graph/src/nozzle/mod.rs | 2 + graph/src/nozzle/sql/query/mod.rs | 18 +- .../sql/query/resolve_event_signatures.rs | 16 +- 8 files changed, 682 insertions(+), 18 deletions(-) create mode 100644 graph/src/nozzle/manifest/data_source/mod.rs create mode 100644 graph/src/nozzle/manifest/data_source/raw.rs create mode 100644 graph/src/nozzle/manifest/mod.rs diff --git a/graph/src/nozzle/client/mod.rs b/graph/src/nozzle/client/mod.rs index 6567af255dd..1f6e7e517c0 100644 --- a/graph/src/nozzle/client/mod.rs +++ b/graph/src/nozzle/client/mod.rs @@ -10,7 +10,7 @@ use crate::nozzle::error; /// Client for connecting to Nozzle core and executing SQL queries. pub trait Client { - type Error: Error + error::IsDeterministic; + type Error: Error + error::IsDeterministic + Send + Sync + 'static; /// Executes a SQL query and returns the corresponding schema. 
fn schema( diff --git a/graph/src/nozzle/common/mod.rs b/graph/src/nozzle/common/mod.rs index 7236bcdbdd4..9075a8edbea 100644 --- a/graph/src/nozzle/common/mod.rs +++ b/graph/src/nozzle/common/mod.rs @@ -5,4 +5,5 @@ pub use self::ident::Ident; pub(super) mod column_aliases { pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &["_block_num", "block_num"]; pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &["hash", "block_hash"]; + pub(in crate::nozzle) static BLOCK_TIMESTAMP: &[&str] = &["timestamp"]; } diff --git a/graph/src/nozzle/manifest/data_source/mod.rs b/graph/src/nozzle/manifest/data_source/mod.rs new file mode 100644 index 00000000000..121ca11f88f --- /dev/null +++ b/graph/src/nozzle/manifest/data_source/mod.rs @@ -0,0 +1,102 @@ +pub mod raw; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use arrow::datatypes::Schema; + +use crate::nozzle::{common::Ident, sql::Query}; + +/// Represents a valid data source of a Nozzle Subgraph. +/// +/// This data source contains parsed, formatted, and resolved data. +#[derive(Debug, Clone)] +pub struct DataSource { + /// The name of the data source. + /// + /// Used for observability to identify progress and errors produced by this data source. + pub name: Ident, + + /// Contains the sources used by this data source. + pub source: Source, + + /// Contains the transformations of source tables indexed by the Subgraph. + pub transformer: Transformer, +} + +impl DataSource { + pub const KIND: &str = "nozzle"; +} + +/// Contains the sources that a data source uses. +#[derive(Debug, Clone)] +pub struct Source { + /// The dataset from which SQL queries in the data source can query. + pub dataset: Ident, + + /// The tables from which SQL queries in the data source can query. + pub tables: Vec, + + /// The contract address with which SQL queries in the data source interact. + /// + /// This address enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. + /// The `sg_source_address()` calls in SQL queries of the data source resolve to this contract address. + /// + /// SQL queries are not limited to using only this contract address. + /// + /// Defaults to an empty contract address. + pub address: Address, + + /// The minimum block number that SQL queries in the data source can query. + /// + /// Defaults to the minimum possible block number. + pub start_block: BlockNumber, + + /// The maximum block number that SQL queries in the data source can query. + /// + /// Defaults to the maximum possible block number. + pub end_block: BlockNumber, +} + +/// Contains the transformations of source tables indexed by the Subgraph. +#[derive(Debug, Clone)] +pub struct Transformer { + /// The ABIs that SQL queries can reference to extract event signatures. + /// + /// The `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls in the + /// SQL queries resolve to a full event signature based on this list. + pub abis: Vec, + + /// The transformed tables that extract data from source tables for indexing. + pub tables: Vec, +} + +/// Represents an ABI of a smart contract. +#[derive(Debug, Clone)] +pub struct Abi { + /// The name of the contract. + pub name: Ident, + + /// The JSON ABI of the contract. + pub contract: JsonAbi, +} + +/// Represents a transformed table that extracts data from source tables for indexing. +#[derive(Debug, Clone)] +pub struct Table { + /// The name of the transformed table. + /// + /// Must reference a valid entity name from the Subgraph schema. 
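For context on the `sg_source_address()` and `sg_event_signature(...)` helpers documented above, a hypothetical query (not taken from this patch; the table and column names are invented) might look like this before the rewrites are applied:

    // Hypothetical SQL, shown here only to illustrate the two helper calls;
    // both are rewritten before the query is sent to the Nozzle server.
    let sql = "SELECT block_num, hash, timestamp, value \
               FROM transfers \
               WHERE address = sg_source_address() \
                 AND signature = sg_event_signature('ERC20', 'Transfer')";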
+ pub name: Ident, + + /// The SQL query that executes on the Nozzle server. + /// + /// The data resulting from this SQL query execution transforms into Subgraph entities. + pub query: Query, + + /// The Arrow schema of this transformed table SQL query. + /// + /// This schema loads from the Nozzle server. + pub schema: Schema, +} diff --git a/graph/src/nozzle/manifest/data_source/raw.rs b/graph/src/nozzle/manifest/data_source/raw.rs new file mode 100644 index 00000000000..a36994108e5 --- /dev/null +++ b/graph/src/nozzle/manifest/data_source/raw.rs @@ -0,0 +1,538 @@ +use std::{collections::HashSet, sync::LazyLock}; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use anyhow::anyhow; +use arrow::datatypes::Schema; +use futures03::future::try_join_all; +use semver::Version; +use serde::Deserialize; +use slog::Logger; +use thiserror::Error; + +use super::{Abi, DataSource, Source, Table, Transformer}; +use crate::{ + components::link_resolver::LinkResolver, + nozzle::{ + self, + common::{column_aliases, Ident}, + error::IsDeterministic, + sql::Query, + }, +}; + +/// Supported API versions for data source transformers. +static API_VERSIONS: LazyLock> = + LazyLock::new(|| HashSet::from([Version::new(0, 0, 1)])); + +/// Represents an unmodified input data source of a Nozzle Subgraph. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawDataSource { + /// The name of the data source. + /// + /// Must be a valid, non-empty identifier with no spaces or special characters. + pub name: String, + + /// The kind of the data source. + /// + /// Must be equal to `nozzle`. + pub kind: String, + + /// Contains sources used by this data source. + pub source: RawSource, + + /// Contains transformations of source tables indexed by the Subgraph. + pub transformer: RawTransformer, +} + +impl RawDataSource { + /// Parses, formats, and resolves the input data source into a valid data source. + pub async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + nozzle_client: &impl nozzle::Client, + ) -> Result { + let Self { + name, + kind, + source, + transformer, + } = self; + + let name = Self::resolve_name(name)?; + Self::resolve_kind(kind)?; + + let source = source + .resolve() + .map_err(|e| e.source_context("invalid `source`"))?; + + let transformer = transformer + .resolve(logger, link_resolver, nozzle_client, &source) + .await + .map_err(|e| e.source_context("invalid `transformer`"))?; + + Ok(DataSource { + name, + source, + transformer, + }) + } + + fn resolve_name(name: String) -> Result { + Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) + } + + fn resolve_kind(kind: String) -> Result<(), Error> { + if !kind.eq_ignore_ascii_case(DataSource::KIND) { + return Err(Error::InvalidValue(anyhow!("invalid `kind`"))); + } + + Ok(()) + } +} + +/// Contains an unmodified input source used by the data source. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawSource { + /// The dataset that SQL queries in the data source can query. + /// + /// Must reference a valid dataset name from the Nozzle server. + pub dataset: String, + + /// The tables that SQL queries in the data source can query. + /// + /// Must reference valid table names of the dataset from the Nozzle server. + pub tables: Vec, + + /// The contract address used by SQL queries in the data source. 
+ /// + /// Enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. + /// SQL queries resolve `sg_source_address()` calls to this contract address. + pub address: Option<Address>
, + + /// The minimum block number that SQL queries in the data source can query. + pub start_block: Option, + + /// The maximum block number that SQL queries in the data source can query. + pub end_block: Option, +} + +impl RawSource { + /// Parses, formats, and resolves the input source into a valid source. + fn resolve(self) -> Result { + let Self { + dataset, + tables, + address, + start_block, + end_block, + } = self; + let dataset = Self::resolve_dataset(dataset)?; + let tables = Self::resolve_tables(tables)?; + let address = address.unwrap_or(Address::ZERO); + let start_block = start_block.unwrap_or(BlockNumber::MIN); + let end_block = end_block.unwrap_or(BlockNumber::MAX); + + if start_block >= end_block { + return Err(Error::InvalidValue(anyhow!( + "`end_block` must be greater than `start_block`" + ))); + } + + Ok(Source { + dataset, + tables, + address, + start_block, + end_block, + }) + } + + fn resolve_dataset(dataset: String) -> Result { + Ident::new(dataset).map_err(|e| Error::InvalidValue(e.context("invalid `dataset`"))) + } + + fn resolve_tables(tables: Vec) -> Result, Error> { + const MAX_TABLES: usize = 100; + + if tables.is_empty() { + return Err(Error::InvalidValue(anyhow!("`tables` cannot be empty"))); + } + + if tables.len() > MAX_TABLES { + return Err(Error::InvalidValue(anyhow!( + "`tables` cannot have more than {MAX_TABLES} tables" + ))); + } + + tables + .into_iter() + .enumerate() + .map(|(i, table)| { + Ident::new(table).map_err(|e| { + Error::InvalidValue(e.context(format!("invalid `tables` at index {i}"))) + }) + }) + .collect() + } +} + +/// Contains unmodified input transformations of source tables indexed by the Subgraph. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawTransformer { + /// The version of this transformer. + /// + /// Must be a supported API version of the Nozzle Subgraph transformers API. + pub api_version: Version, + + /// The ABIs that SQL queries can reference to extract event signatures. + /// + /// SQL queries resolve `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls + /// to full event signatures based on this list. + pub abis: Option>, + + /// The transformed tables that extract data from source tables for indexing. + pub tables: Vec, +} + +impl RawTransformer { + /// Parses, formats, and resolves the input transformer into a valid transformer. 
+ async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + nozzle_client: &impl nozzle::Client, + source: &Source, + ) -> Result { + let Self { + api_version, + abis, + tables, + } = self; + let _api_version = Self::resolve_api_version(api_version)?; + let abis = Self::resolve_abis(logger, link_resolver, abis).await?; + let tables = + Self::resolve_tables(logger, link_resolver, nozzle_client, tables, source, &abis) + .await?; + + Ok(Transformer { abis, tables }) + } + + fn resolve_api_version(api_version: Version) -> Result { + if !API_VERSIONS.contains(&api_version) { + return Err(Error::InvalidValue(anyhow!("invalid `api_version`"))); + } + + Ok(api_version) + } + + async fn resolve_abis( + logger: &Logger, + link_resolver: &dyn LinkResolver, + abis: Option>, + ) -> Result, Error> { + const MAX_ABIS: usize = 100; + + let Some(abis) = abis else { + return Ok(Vec::new()); + }; + + if abis.len() > MAX_ABIS { + return Err(Error::InvalidValue(anyhow!( + "`abis` cannot have more than {MAX_ABIS} ABIs" + ))); + } + + let abi_futs = abis.into_iter().enumerate().map(|(i, abi)| async move { + abi.resolve(logger, link_resolver) + .await + .map_err(|e| e.source_context(format!("invalid `abis` at index {i}"))) + }); + + try_join_all(abi_futs).await + } + + async fn resolve_tables( + logger: &Logger, + link_resolver: &dyn LinkResolver, + nozzle_client: &impl nozzle::Client, + tables: Vec, + source: &Source, + abis: &[Abi], + ) -> Result, Error> { + const MAX_TABLES: usize = 100; + + if tables.is_empty() { + return Err(Error::InvalidValue(anyhow!("`tables` cannot be empty"))); + } + + if tables.len() > MAX_TABLES { + return Err(Error::InvalidValue(anyhow!( + "`tables` cannot have more than {MAX_TABLES} tables" + ))); + } + + let table_futs = tables.into_iter().enumerate().map(|(i, table)| async move { + table + .resolve(logger, link_resolver, nozzle_client, source, abis) + .await + .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) + }); + + try_join_all(table_futs).await + } +} + +/// Represents an unmodified input ABI of a smart contract. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawAbi { + /// The name of the contract. + pub name: String, + + /// The IPFS link to the JSON ABI of the contract. + pub file: String, +} + +impl RawAbi { + /// Parses, formats, and resolves the input ABI into a valid ABI. + async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + ) -> Result { + let Self { name, file } = self; + let name = Self::resolve_name(name)?; + let contract = Self::resolve_contract(logger, link_resolver, file).await?; + + Ok(Abi { name, contract }) + } + + fn resolve_name(name: String) -> Result { + Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) + } + + async fn resolve_contract( + logger: &Logger, + link_resolver: &dyn LinkResolver, + file: String, + ) -> Result { + if file.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + let file_bytes = link_resolver + .cat(logger, &(file.into())) + .await + .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; + + let contract: JsonAbi = serde_json::from_slice(&file_bytes) + .map_err(|e| Error::InvalidValue(anyhow!(e).context("invalid `file`")))?; + + Ok(contract) + } +} + +/// Represents an unmodified input transformed table that extracts data from source tables for indexing. 
+/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawTable { + /// The name of the transformed table. + /// + /// Must reference a valid entity name from the Subgraph schema. + pub name: String, + + /// The SQL query that executes on the Nozzle server. + /// + /// Transforms the execution results into Subgraph entities. + pub query: Option, + + /// The IPFS link to the SQL query that executes on the Nozzle server. + /// + /// Transforms the execution results into Subgraph entities. + /// + /// Ignored when `query` is set. + pub file: Option, +} + +impl RawTable { + /// Parses, formats, and resolves the input table into a valid transformed table. + async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + nozzle_client: &impl nozzle::Client, + source: &Source, + abis: &[Abi], + ) -> Result { + let Self { name, query, file } = self; + let name = Self::resolve_name(name)?; + let query = match Self::resolve_query(query, source, abis)? { + Some(query) => query, + None => Self::resolve_file(logger, link_resolver, file, source, abis).await?, + }; + let schema = Self::resolve_schema(logger, nozzle_client, &query).await?; + + Ok(Table { + name, + query, + schema, + }) + } + + fn resolve_name(name: String) -> Result { + Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) + } + + fn resolve_query( + query: Option, + source: &Source, + abis: &[Abi], + ) -> Result, Error> { + let Some(query) = query else { + return Ok(None); + }; + + if query.is_empty() { + return Err(Error::InvalidValue(anyhow!("`query` cannot be empty"))); + } + + Query::new( + query, + &source.dataset, + &source.tables, + &source.address, + abis.iter().map(|abi| (&abi.name, &abi.contract)), + ) + .map(Some) + .map_err(|e| Error::InvalidValue(e.context("invalid `query`"))) + } + + async fn resolve_file( + logger: &Logger, + link_resolver: &dyn LinkResolver, + file: Option, + source: &Source, + abis: &[Abi], + ) -> Result { + let Some(file) = file else { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + }; + + if file.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + let file_bytes = link_resolver + .cat(logger, &(file.into())) + .await + .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; + + let query = String::from_utf8(file_bytes) + .map_err(|e| Error::InvalidValue(anyhow!(e).context("invalid `file`")))?; + + if query.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + Query::new( + query, + &source.dataset, + &source.tables, + &source.address, + abis.iter().map(|abi| (&abi.name, &abi.contract)), + ) + .map_err(|e| Error::InvalidValue(e.context("invalid `file`"))) + } + + async fn resolve_schema( + logger: &Logger, + nozzle_client: &impl nozzle::Client, + query: &Query, + ) -> Result { + let schema = nozzle_client.schema(logger, &query).await.map_err(|e| { + Error::FailedToExecuteQuery { + is_deterministic: e.is_deterministic(), + source: anyhow!(e).context("failed to load schema"), + } + })?; + + let check_required_column = |c: &[&str], kind: &str| { + if !c.iter().any(|&c| schema.column_with_name(c).is_some()) { + return Err(Error::InvalidQuery(anyhow!("query must return {kind}"))); + } + Ok(()) + }; + + check_required_column(column_aliases::BLOCK_NUMBER, "block numbers")?; + check_required_column(column_aliases::BLOCK_HASH, "block hashes")?; + 
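    // For illustration (not part of this patch): a query whose projection includes,
    // for example, `block_num` (or `_block_num`), `hash` (or `block_hash`), and
    // `timestamp` passes these three required-column checks; the accepted aliases
    // are the ones defined in `nozzle::common::column_aliases`.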
check_required_column(column_aliases::BLOCK_TIMESTAMP, "block timestamps")?; + + Ok(schema) + } +} + +#[derive(Debug, Error)] +pub enum Error { + #[error("invalid value: {0:#}")] + InvalidValue(#[source] anyhow::Error), + + #[error("invalid query: {0:#}")] + InvalidQuery(#[source] anyhow::Error), + + #[error("failed to resolve file: {0:#}")] + FailedToResolveFile(#[source] anyhow::Error), + + #[error("failed to execute query: {source:#}")] + FailedToExecuteQuery { + source: anyhow::Error, + is_deterministic: bool, + }, +} + +impl Error { + /// Extends the source errors with additional context keeping the original error kind and the determinism. + fn source_context(self, cx: impl Into) -> Self { + match self { + Self::InvalidValue(e) => Self::InvalidValue(e.context(cx.into())), + Self::InvalidQuery(e) => Self::InvalidQuery(e.context(cx.into())), + Self::FailedToResolveFile(e) => Self::FailedToResolveFile(e.context(cx.into())), + Self::FailedToExecuteQuery { + source, + is_deterministic, + } => Self::FailedToExecuteQuery { + source: source.context(cx.into()), + is_deterministic, + }, + } + } +} + +impl IsDeterministic for Error { + fn is_deterministic(&self) -> bool { + match self { + Self::InvalidValue(_) => true, + Self::InvalidQuery(_) => true, + Self::FailedToResolveFile(_) => false, + Self::FailedToExecuteQuery { + is_deterministic, .. + } => *is_deterministic, + } + } +} diff --git a/graph/src/nozzle/manifest/mod.rs b/graph/src/nozzle/manifest/mod.rs new file mode 100644 index 00000000000..e294a50ddf5 --- /dev/null +++ b/graph/src/nozzle/manifest/mod.rs @@ -0,0 +1,21 @@ +pub mod data_source; + +use crate::schema::InputSchema; + +pub use self::data_source::DataSource; + +/// Represents a valid Nozzle Subgraph manifest. +/// +/// This manifest contains parsed, formatted, and resolved data. +#[derive(Debug, Clone)] +pub struct Manifest { + /// The schema of the Subgraph. + /// + /// Contains all the entities, aggregations, and relationships between them. + pub schema: InputSchema, + + /// The Nozzle data sources of the Subgraph. + /// + /// A Nozzle Subgraph can only contain Nozzle data sources. + pub data_sources: Vec, +} diff --git a/graph/src/nozzle/mod.rs b/graph/src/nozzle/mod.rs index c3d579e3464..9683c042f3c 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/nozzle/mod.rs @@ -5,6 +5,7 @@ pub mod codec; pub mod common; pub mod error; pub mod log; +pub mod manifest; pub mod schema; pub mod sql; pub mod stream_aggregator; @@ -12,4 +13,5 @@ pub mod stream_aggregator; pub use self::{ client::{flight_client::FlightClient, Client}, codec::Codec, + manifest::Manifest, }; diff --git a/graph/src/nozzle/sql/query/mod.rs b/graph/src/nozzle/sql/query/mod.rs index 8f9a928609b..c411ab74846 100644 --- a/graph/src/nozzle/sql/query/mod.rs +++ b/graph/src/nozzle/sql/query/mod.rs @@ -19,6 +19,7 @@ use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; /// /// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. /// The data returned by executing this query is used to create Subgraph entities. +#[derive(Debug, Clone)] pub struct Query { /// The raw SQL AST that represents the SQL query. ast: ast::Query, @@ -30,15 +31,6 @@ pub struct Query { tables: Vec, } -/// Contains the ABI information that is used to resolve event signatures in SQL queries. -pub struct Abi<'a> { - /// The name of the contract. - pub name: &'a Ident, - - /// The JSON ABI of the contract. 
- pub contract: &'a JsonAbi, -} - impl Query { /// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. /// @@ -55,7 +47,7 @@ impl Query { dataset: &Ident, tables: &[Ident], source_address: &Address, - abis: impl IntoIterator>, + abis: impl IntoIterator, ) -> Result { let mut query = parse::query(sql).context("failed to parse SQL query")?; let abis = abis.into_iter().collect_vec(); @@ -117,7 +109,11 @@ impl Query { /// - Event signature function calls cannot be resolved /// /// The returned error is deterministic. - fn resolve(query: &mut ast::Query, source_address: &Address, abis: &[Abi<'_>]) -> Result<()> { + fn resolve( + query: &mut ast::Query, + source_address: &Address, + abis: &[(&Ident, &JsonAbi)], + ) -> Result<()> { resolve_source_address::resolve_source_address(query, source_address)?; resolve_event_signatures::resolve_event_signatures(query, abis)?; diff --git a/graph/src/nozzle/sql/query/resolve_event_signatures.rs b/graph/src/nozzle/sql/query/resolve_event_signatures.rs index a243cd12268..d67bc0263d8 100644 --- a/graph/src/nozzle/sql/query/resolve_event_signatures.rs +++ b/graph/src/nozzle/sql/query/resolve_event_signatures.rs @@ -1,9 +1,10 @@ use std::ops::ControlFlow; +use alloy::json_abi::JsonAbi; use anyhow::{bail, Context, Result}; use sqlparser_latest::ast::{self, visit_expressions_mut}; -use super::Abi; +use crate::nozzle::common::Ident; static FUNCTION_NAME: &str = "sg_event_signature"; @@ -18,7 +19,10 @@ static FUNCTION_NAME: &str = "sg_event_signature"; /// - The event name is not found in `abis` /// /// The returned error is deterministic. -pub(super) fn resolve_event_signatures(query: &mut ast::Query, abis: &[Abi<'_>]) -> Result<()> { +pub(super) fn resolve_event_signatures( + query: &mut ast::Query, + abis: &[(&Ident, &JsonAbi)], +) -> Result<()> { let visit_result = visit_expressions_mut(query, |expr| match visit_expr(expr, abis) { Ok(()) => ControlFlow::Continue(()), Err(e) => ControlFlow::Break(e), @@ -31,7 +35,7 @@ pub(super) fn resolve_event_signatures(query: &mut ast::Query, abis: &[Abi<'_>]) Ok(()) } -fn visit_expr(expr: &mut ast::Expr, abis: &[Abi<'_>]) -> Result<()> { +fn visit_expr(expr: &mut ast::Expr, abis: &[(&Ident, &JsonAbi)]) -> Result<()> { let ast::Expr::Function(function) = expr else { return Ok(()); }; @@ -93,13 +97,13 @@ fn get_arg<'a>(arg: &'a ast::FunctionArg) -> Option<&'a str> { } fn get_event<'a>( - abis: &'a [Abi<'_>], + abis: &'a [(&Ident, &JsonAbi)], contract_name: &str, event_name: &str, ) -> Option<&'a alloy::json_abi::Event> { abis.iter() - .find(|abi| abi.name.as_str() == contract_name) - .map(|abi| abi.contract.event(event_name)) + .find(|(name, _)| name.as_str() == contract_name) + .map(|(_, contract)| contract.event(event_name)) .flatten() .map(|events| events.first()) .flatten() From e7e11f803a89bae15022c314eccc67c4adcb7fb2 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 08/40] feat(graph): add reorg handling to the Nozzle FlightClient --- graph/src/nozzle/client/flight_client.rs | 262 ++++++++++++++++++---- graph/src/nozzle/client/mod.rs | 55 ++++- graph/src/nozzle/stream_aggregator/mod.rs | 14 +- 3 files changed, 284 insertions(+), 47 deletions(-) diff --git a/graph/src/nozzle/client/flight_client.rs b/graph/src/nozzle/client/flight_client.rs index ea89aba6005..19d4981388d 100644 --- a/graph/src/nozzle/client/flight_client.rs +++ b/graph/src/nozzle/client/flight_client.rs @@ -1,25 +1,32 @@ use 
std::{ + collections::HashMap, hash::{Hash, Hasher}, + ops::RangeInclusive, time::Duration, }; use ahash::AHasher; -use arrow::{array::RecordBatch, datatypes::Schema, error::ArrowError}; +use alloy::primitives::{BlockHash, BlockNumber}; +use arrow::{datatypes::Schema, error::ArrowError}; use arrow_flight::{ - error::FlightError, flight_service_client::FlightServiceClient, + decode::DecodedPayload, error::FlightError, flight_service_client::FlightServiceClient, sql::client::FlightSqlServiceClient, }; use async_stream::try_stream; use bytes::Bytes; use futures03::{future::BoxFuture, stream::BoxStream, StreamExt}; +use http::Uri; use lazy_regex::regex_is_match; -use slog::{debug, Logger}; +use serde::{Deserialize, Serialize}; +use slog::{debug, trace, Logger}; use thiserror::Error; use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; use crate::{ nozzle::{ - client::Client, + client::{ + Client, LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery, + }, error, log::{one_line, Logger as _}, }, @@ -34,27 +41,11 @@ pub struct FlightClient { channel: Channel, } -#[derive(Debug, Error)] -pub enum Error { - // Address excluded to avoid leaking sensitive details in logs - #[error("invalid address")] - InvalidAddress, - - #[error("service failed: {0:#}")] - Service(#[source] ArrowError), - - #[error("stream failed: {0:#}")] - Stream(#[source] FlightError), -} - impl FlightClient { - /// Constructs a new Nozzle client connected to the specified Nozzle Flight service address. - pub fn new(addr: impl Into) -> Result { - let addr: Bytes = addr.into(); - let is_https = std::str::from_utf8(&addr).map_or(false, |a| a.starts_with("https://")); - - let mut endpoint = Endpoint::from_shared(addr) - .map_err(|_e| Error::InvalidAddress)? + /// Creates a new Nozzle client connected to the specified Nozzle Flight service address. + pub async fn new(addr: Uri) -> Result { + let is_https = addr.scheme() == Some(&http::uri::Scheme::HTTPS); + let mut endpoint = Endpoint::from(addr) .tcp_keepalive(Some(Duration::from_secs(30))) .keep_alive_while_idle(true) .http2_adaptive_window(true) @@ -70,7 +61,7 @@ impl FlightClient { } Ok(Self { - channel: endpoint.connect_lazy(), + channel: endpoint.connect().await.map_err(Error::Connection)?, }) } @@ -116,18 +107,40 @@ impl Client for FlightClient { &self, logger: &Logger, query: impl ToString, - ) -> BoxStream<'static, Result> { - let logger = logger.component("nozzle::FlightClient"); - let mut raw_client = self.raw_client(); + request_metadata: Option, + ) -> BoxStream<'static, Result> { let query = query.to_string(); - let query_id = query_id(&query); + let logger = logger + .component("nozzle::FlightClient") + .new(slog::o!("query_id" => query_id(&query))); + + let mut raw_client = self.raw_client(); + let mut prev_block_ranges: Vec = Vec::new(); + + if let Some(request_metadata) = request_metadata { + let RequestMetadata { + resume_streaming_query, + } = request_metadata; + + if let Some(resume_streaming_query) = resume_streaming_query { + prev_block_ranges = resume_streaming_query + .iter() + .cloned() + .map(Into::into) + .collect(); + + raw_client.set_header( + "nozzle-resume", + serialize_resume_streaming_query(resume_streaming_query), + ); + } + } try_stream! 
{ const TXN_ID: Option = None; debug!(logger, "Executing SQL query"; - "query" => &*one_line(&query), - "query_id" => query_id + "query" => &*one_line(&query) ); let flight_info = raw_client @@ -135,30 +148,56 @@ impl Client for FlightClient { .await .map_err(Error::Service)?; - for endpoint in flight_info.endpoint { + for (endpoint_index, endpoint) in flight_info.endpoint.into_iter().enumerate() { let Some(ticket) = endpoint.ticket else { continue; }; - let mut stream = raw_client.do_get(ticket).await.map_err(Error::Service)?; + let mut stream = raw_client.do_get(ticket).await.map_err(Error::Service)?.into_inner(); let mut batch_index = 0u32; + let mut prev_block_ranges = prev_block_ranges.clone(); while let Some(batch_result) = stream.next().await { - debug!(logger, "Received a new record batch"; - "query_id" => query_id, + let flight_data = batch_result.map_err(Error::Stream)?; + let app_metadata = flight_data.inner.app_metadata; + let payload = flight_data.payload; + + let record_batch = match payload { + DecodedPayload::None => { + trace!(logger, "Received empty data"; + "endpoint_index" => endpoint_index + ); + continue + }, + DecodedPayload::Schema(_) => { + trace!(logger, "Received schema only"; + "endpoint_index" => endpoint_index + ); + continue + } + DecodedPayload::RecordBatch(record_batch) => record_batch, + }; + let block_ranges = Metadata::parse(&app_metadata)?.ranges; + + trace!(logger, "Received a new record batch"; + "endpoint_index" => endpoint_index, "batch_index" => batch_index, - "num_rows" => batch_result.as_ref().map_or(0, |b| b.num_rows()), - "memory_size_bytes" => batch_result.as_ref().map_or(0, |b| b.get_array_memory_size()) + "num_rows" => record_batch.num_rows(), + "memory_size_bytes" => record_batch.get_array_memory_size(), + "block_ranges" => ?block_ranges ); - let record_batch = batch_result.map_err(Error::Stream)?; - yield record_batch; + if let Some(reorg) = detect_reorg(&block_ranges, &prev_block_ranges) { + yield ResponseBatch::Reorg(reorg); + } + + yield ResponseBatch::Batch { data: record_batch}; batch_index += 1; + prev_block_ranges = block_ranges; } debug!(logger, "Query execution completed successfully"; - "query_id" => query_id, "batch_count" => batch_index ); } @@ -167,17 +206,37 @@ impl Client for FlightClient { } } +#[derive(Debug, Error)] +pub enum Error { + #[error("invalid metadata: {0:#}")] + InvalidMetadata(#[source] anyhow::Error), + + #[error("connection failed: {0:#}")] + Connection(#[source] tonic::transport::Error), + + #[error("service failed: {0:#}")] + Service(#[source] ArrowError), + + #[error("stream failed: {0:#}")] + Stream(#[source] FlightError), +} + impl error::IsDeterministic for Error { fn is_deterministic(&self) -> bool { - static PATTERNS: &[&str] = &[ + let msg = match self { + Self::InvalidMetadata(_) => return true, + Self::Connection(_) => return false, + Self::Service(e) => e.to_string(), + Self::Stream(_) => return false, + }; + + static DETERMINISTIC_ERROR_PATTERNS: &[&str] = &[ r#", message: "SQL parse error:"#, r#", message: "error looking up datasets:"#, r#", message: "planning error:"#, ]; - let msg = self.to_string(); - - for &pattern in PATTERNS { + for &pattern in DETERMINISTIC_ERROR_PATTERNS { if msg.contains(pattern) { return true; } @@ -191,6 +250,63 @@ impl error::IsDeterministic for Error { } } +/// Metadata received with every record batch. +#[derive(Debug, Clone, Deserialize)] +struct Metadata { + /// Block ranges processed by the Nozzle server to produce the record batch. 
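    // Illustrative shape of the `app_metadata` payload parsed here, inferred from
    // the `Deserialize` derives on `Metadata` and `BlockRange`; the exact
    // server-side encoding may differ:
    //   {"ranges": [{"network": "mainnet",
    //                "numbers": {"start": 100, "end": 110},
    //                "hash": "0x…", "prev_hash": "0x…"}]}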
+ ranges: Vec, +} + +impl Metadata { + /// Parses and returns the metadata. + fn parse(app_metadata: &[u8]) -> Result { + if app_metadata.is_empty() { + return Ok(Self { ranges: Vec::new() }); + } + + serde_json::from_slice::(app_metadata).map_err(|e| Error::InvalidMetadata(e.into())) + } +} + +/// Block range processed by the Nozzle server to produce a record batch. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +struct BlockRange { + /// Network that contains the source data for the dataset. + network: String, + + /// Block numbers processed. + numbers: RangeInclusive, + + /// Hash of the last block in the block range. + hash: BlockHash, + + /// Hash of the parent block of the first block in the block range. + prev_hash: Option, +} + +impl BlockRange { + /// Returns the first block number in the range. + fn start(&self) -> BlockNumber { + *self.numbers.start() + } + + /// Returns the last block number in the range. + fn end(&self) -> BlockNumber { + *self.numbers.end() + } +} + +impl From for BlockRange { + fn from(resume: ResumeStreamingQuery) -> Self { + Self { + network: resume.network, + numbers: resume.block_number..=resume.block_number, + hash: resume.block_hash, + prev_hash: None, + } + } +} + /// Generates an ID from a SQL query for log correlation. /// /// The ID allows connecting related logs without including the full SQL @@ -200,3 +316,61 @@ fn query_id(query: &str) -> u32 { query.hash(&mut hasher); hasher.finish() as u32 } + +/// Serializes the information required to resume a streaming SQL query to JSON. +fn serialize_resume_streaming_query(resume_streaming_query: Vec) -> String { + #[derive(Serialize)] + struct Block { + number: BlockNumber, + hash: BlockHash, + } + + let mapping: HashMap = resume_streaming_query + .into_iter() + .map( + |ResumeStreamingQuery { + network, + block_number: number, + block_hash: hash, + }| { (network, Block { number, hash }) }, + ) + .collect(); + + serde_json::to_string(&mapping).unwrap() +} + +/// Detects whether a reorg occurred during query execution. +/// +/// Compares current block ranges with block ranges from the previous record batch +/// to detect non-incremental batches. When a non-incremental batch is detected, +/// returns the block number and hash of the parent block of the first block +/// after reorg for every processed network. +/// +/// Returns `None` when no reorgs are detected. 
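A worked example of the rule described above (a sketch with made-up values; `BlockHash::ZERO` stands in for real hashes):

    // Previous batch covered blocks 100..=110 on "mainnet"; the next batch
    // starts again at 105, so the stream is not strictly incremental.
    let prev = vec![BlockRange {
        network: "mainnet".to_string(),
        numbers: 100..=110,
        hash: BlockHash::ZERO,
        prev_hash: None,
    }];
    let next = vec![BlockRange {
        network: "mainnet".to_string(),
        numbers: 105..=112,
        hash: BlockHash::ZERO,
        prev_hash: Some(BlockHash::ZERO),
    }];

    // The ranges differ and 105 <= 110, so a reorg is reported: the latest block
    // before the reorg is 104, carrying `next[0].prev_hash` as its hash.
    assert!(detect_reorg(&next, &prev).is_some());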
+fn detect_reorg( + block_ranges: &[BlockRange], + prev_block_ranges: &[BlockRange], +) -> Option> { + Some( + block_ranges + .iter() + .filter_map(|block_range| { + let prev_block_range = prev_block_ranges + .iter() + .find(|prev_block_range| prev_block_range.network == block_range.network)?; + + if block_range != prev_block_range && block_range.start() <= prev_block_range.end() + { + return Some(LatestBlockBeforeReorg { + network: block_range.network.clone(), + block_number: block_range.start().saturating_sub(1), + block_hash: block_range.prev_hash, + }); + } + + None + }) + .collect::>(), + ) + .filter(|v| !v.is_empty()) +} diff --git a/graph/src/nozzle/client/mod.rs b/graph/src/nozzle/client/mod.rs index 1f6e7e517c0..f4c2fddffd5 100644 --- a/graph/src/nozzle/client/mod.rs +++ b/graph/src/nozzle/client/mod.rs @@ -2,6 +2,7 @@ pub mod flight_client; use std::error::Error; +use alloy::primitives::{BlockHash, BlockNumber}; use arrow::{array::RecordBatch, datatypes::Schema}; use futures03::{future::BoxFuture, stream::BoxStream}; use slog::Logger; @@ -24,5 +25,57 @@ pub trait Client { &self, logger: &Logger, query: impl ToString, - ) -> BoxStream<'static, Result>; + request_metadata: Option, + ) -> BoxStream<'static, Result>; +} + +/// Metadata sent to the Nozzle server with the SQL query. +#[derive(Debug, Clone)] +pub struct RequestMetadata { + /// Allows resuming streaming SQL queries from any block. + pub resume_streaming_query: Option>, +} + +/// Resumes a streaming SQL query from the specified block. +#[derive(Debug, Clone)] +pub struct ResumeStreamingQuery { + /// Network that contains the source data for the dataset. + pub network: String, + + /// Block number after which the SQL query should resume. + /// + /// An invalid block number triggers a reorg message. + pub block_number: BlockNumber, + + /// Block hash of the block after which the SQL query should resume. + /// + /// An invalid block hash triggers a reorg message. + pub block_hash: BlockHash, +} + +/// Represents a batch response resulting from query execution on the Nozzle server. +#[derive(Debug, Clone)] +pub enum ResponseBatch { + /// Contains the batch data received from the Nozzle server. + Batch { data: RecordBatch }, + + /// Contains the reorg message received from the Nozzle server. + /// + /// It is received before the record batch that contains the data after the reorg. + Reorg(Vec), +} + +/// Represents the parent block of the first block after the reorg. +#[derive(Debug, Clone)] +pub struct LatestBlockBeforeReorg { + /// Network that contains the source data for the dataset. + pub network: String, + + /// Block number of the parent block of the first block after the reorg. + pub block_number: BlockNumber, + + /// Block hash of the parent block of the first block after the reorg. + /// + /// It is `None` when the reorg affects every block in the blockchain. 
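    // For illustration (not from this patch): a consumer that persisted the last
    // good block per network can resume a streaming query by passing, as the
    // `request_metadata` argument introduced above, something like
    //
    //   RequestMetadata {
    //       resume_streaming_query: Some(vec![ResumeStreamingQuery {
    //           network: "mainnet".to_string(),
    //           block_number: 12_345_677,
    //           block_hash: last_good_hash,
    //       }]),
    //   }
    //
    // where `last_good_hash` is the persisted hash; an unknown number/hash pair
    // makes the server answer with a reorg message, as documented above.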
+ pub block_hash: Option, } diff --git a/graph/src/nozzle/stream_aggregator/mod.rs b/graph/src/nozzle/stream_aggregator/mod.rs index 4ec9af8f4ba..00583e423c1 100644 --- a/graph/src/nozzle/stream_aggregator/mod.rs +++ b/graph/src/nozzle/stream_aggregator/mod.rs @@ -12,7 +12,7 @@ use futures03::{stream::BoxStream, Stream, StreamExt, TryStreamExt}; use slog::{debug, info, Logger}; use self::record_batch::Buffer; -use crate::nozzle::{error::IsDeterministic, log::Logger as _}; +use crate::nozzle::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}; pub use self::{ error::Error, @@ -49,7 +49,7 @@ impl StreamAggregator { /// Creates a new stream aggregator from the `streams` with a bounded buffer. pub fn new( logger: &Logger, - streams: impl IntoIterator>>, + streams: impl IntoIterator>>, max_buffer_size: usize, ) -> Self where @@ -63,6 +63,16 @@ impl StreamAggregator { .map(|(stream_index, stream)| { stream .map_err(move |e| Error::stream(stream_index, e)) + .try_filter_map(move |response_batch| async move { + match response_batch { + ResponseBatch::Batch { data } => Ok(Some(data)), + ResponseBatch::Reorg(_) => Err(Error::Stream { + stream_index, + source: anyhow!("chain reorg"), + is_deterministic: false, + }), + } + }) .boxed() }) .collect::>(); From be604dbddc9538e44cff1eb7ea5128462808aa15 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 09/40] feat(graph, core): extend SubgraphInstanceManager trait --- core/src/subgraph/instance_manager.rs | 8 ++++++++ graph/src/components/subgraph/instance_manager.rs | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs index 81c1a3ccd1a..845d666c414 100644 --- a/core/src/subgraph/instance_manager.rs +++ b/core/src/subgraph/instance_manager.rs @@ -58,6 +58,14 @@ pub struct SubgraphInstanceManager { #[async_trait] impl SubgraphInstanceManagerTrait for SubgraphInstanceManager { + fn can_manage( + &self, + _deployment: &DeploymentLocator, + _raw_manifest: &serde_yaml::Mapping, + ) -> bool { + true + } + async fn start_subgraph( self: Arc, loc: DeploymentLocator, diff --git a/graph/src/components/subgraph/instance_manager.rs b/graph/src/components/subgraph/instance_manager.rs index c9f076a2a36..7caf64626ba 100644 --- a/graph/src/components/subgraph/instance_manager.rs +++ b/graph/src/components/subgraph/instance_manager.rs @@ -10,6 +10,13 @@ use crate::components::store::DeploymentLocator; /// subgraph instance manager stops and removes the corresponding instance. #[async_trait::async_trait] pub trait SubgraphInstanceManager: Send + Sync + 'static { + /// Returns `true` if this manager has the necessary capabilities to manage the subgraph. 
+ fn can_manage( + &self, + deployment: &DeploymentLocator, + raw_manifest: &serde_yaml::Mapping, + ) -> bool; + async fn start_subgraph( self: Arc, deployment: DeploymentLocator, From 36153013cf9ca7fbc50a49c03c26229764667e40 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 10/40] feat(core, graph, node): allow multiple subgraph instance managers --- Cargo.lock | 35 +- Cargo.toml | 3 + core/Cargo.toml | 9 + core/src/lib.rs | 8 +- core/src/subgraph/instance_manager.rs | 8 - core/src/subgraph/mod.rs | 2 - core/src/subgraph/provider.rs | 101 ----- core/src/subgraph/registrar.rs | 23 +- core/src/subgraph_provider.rs | 365 ++++++++++++++++++ .../components/subgraph/instance_manager.rs | 15 +- graph/src/components/subgraph/mod.rs | 2 - graph/src/components/subgraph/provider.rs | 10 - graph/src/lib.rs | 3 +- graph/src/nozzle/manifest/data_source/raw.rs | 13 +- node/Cargo.toml | 3 + node/src/launcher.rs | 54 +-- node/src/manager/commands/run.rs | 39 +- tests/Cargo.toml | 1 + tests/src/fixture/mod.rs | 72 ++-- tests/tests/runner_tests.rs | 8 +- 20 files changed, 530 insertions(+), 244 deletions(-) delete mode 100644 core/src/subgraph/provider.rs create mode 100644 core/src/subgraph_provider.rs delete mode 100644 graph/src/components/subgraph/provider.rs diff --git a/Cargo.lock b/Cargo.lock index 979fccde810..9e5af2832f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -732,7 +732,7 @@ dependencies = [ "serde_json", "tokio", "tokio-stream", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1348,7 +1348,7 @@ dependencies = [ "memchr", "pin-project-lite", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", ] [[package]] @@ -2868,13 +2868,20 @@ dependencies = [ "atomic_refcell", "bytes", "cid", + "futures 0.3.31", "graph", "graph-chain-ethereum", "graph-chain-near", "graph-chain-substreams", "graph-runtime-wasm", + "itertools", + "parking_lot", "serde_yaml", + "slog", + "strum", "thiserror 2.0.16", + "tokio", + "tokio-util 0.7.17", "tower 0.5.2 (git+https://github.com/tower-rs/tower.git)", "tower-test", "wiremock", @@ -2926,6 +2933,7 @@ dependencies = [ "serde", "shellexpand", "termcolor", + "tokio-util 0.7.17", "url", ] @@ -3071,6 +3079,7 @@ dependencies = [ "slog", "tokio", "tokio-stream", + "tokio-util 0.7.17", ] [[package]] @@ -3182,7 +3191,7 @@ dependencies = [ "indexmap 2.11.4", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tracing", ] @@ -3201,7 +3210,7 @@ dependencies = [ "indexmap 2.11.4", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tracing", ] @@ -3813,7 +3822,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", - "hashbrown 0.15.2", + "hashbrown 0.16.1", "serde", "serde_core", ] @@ -5501,7 +5510,7 @@ dependencies = [ "sha1_smol", "socket2 0.5.7", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "url", ] @@ -5615,7 +5624,7 @@ dependencies = [ "tokio", "tokio-native-tls", "tokio-rustls", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "tower-http", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", @@ -6939,7 +6948,7 @@ dependencies = [ "rand 0.9.2", "socket2 0.5.7", "tokio", - 
"tokio-util 0.7.11", + "tokio-util 0.7.17", "whoami", ] @@ -6974,7 +6983,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", ] [[package]] @@ -7019,9 +7028,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -7177,7 +7186,7 @@ dependencies = [ "rand 0.8.5", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-layer 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "tracing", @@ -7212,7 +7221,7 @@ dependencies = [ "slab", "sync_wrapper 1.0.1", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-layer 0.3.3 (git+https://github.com/tower-rs/tower.git)", "tower-service 0.3.3 (git+https://github.com/tower-rs/tower.git)", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 3c66469d523..351063d4b34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,9 +103,12 @@ ahash = "0.8.11" alloy = { version = "1.0.12", default-features = false, features = ["json-abi", "serde"] } arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } +futures = "0.3.31" heck = "0.5.0" lazy-regex = "3.4.1" +parking_lot = "0.12.4" sqlparser-latest = { version = "0.57.0", package = "sqlparser", features = ["visitor"] } +tokio-util = "0.7.15" # Incremental compilation on Rust 1.58 causes an ICE on build. As soon as graph node builds again, these can be removed. 
[profile.test] diff --git a/core/Cargo.toml b/core/Cargo.toml index 0a5440b2b30..72fc4ad05ea 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -19,6 +19,15 @@ thiserror = { workspace = true } cid = "0.11.1" anyhow = "1.0" +# Dependencies related to Amp subgraphs +futures.workspace = true +itertools.workspace = true +parking_lot.workspace = true +slog.workspace = true +strum.workspace = true +tokio-util.workspace = true +tokio.workspace = true + [dev-dependencies] tower-test = { git = "https://github.com/tower-rs/tower.git" } wiremock = "0.6.5" diff --git a/core/src/lib.rs b/core/src/lib.rs index 448bb1041fd..1e9a7bfbdc3 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,8 +1,4 @@ pub mod polling_monitor; -mod subgraph; - -pub use crate::subgraph::{ - SubgraphAssignmentProvider, SubgraphInstanceManager, SubgraphRegistrar, SubgraphRunner, - SubgraphTriggerProcessor, -}; +pub mod subgraph; +pub mod subgraph_provider; diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs index 845d666c414..81c1a3ccd1a 100644 --- a/core/src/subgraph/instance_manager.rs +++ b/core/src/subgraph/instance_manager.rs @@ -58,14 +58,6 @@ pub struct SubgraphInstanceManager { #[async_trait] impl SubgraphInstanceManagerTrait for SubgraphInstanceManager { - fn can_manage( - &self, - _deployment: &DeploymentLocator, - _raw_manifest: &serde_yaml::Mapping, - ) -> bool { - true - } - async fn start_subgraph( self: Arc, loc: DeploymentLocator, diff --git a/core/src/subgraph/mod.rs b/core/src/subgraph/mod.rs index 45f8d5b98ef..8f6bc932daa 100644 --- a/core/src/subgraph/mod.rs +++ b/core/src/subgraph/mod.rs @@ -3,7 +3,6 @@ mod error; mod inputs; mod instance_manager; mod loader; -mod provider; mod registrar; mod runner; mod state; @@ -11,7 +10,6 @@ mod stream; mod trigger_processor; pub use self::instance_manager::SubgraphInstanceManager; -pub use self::provider::SubgraphAssignmentProvider; pub use self::registrar::SubgraphRegistrar; pub use self::runner::SubgraphRunner; pub use self::trigger_processor::*; diff --git a/core/src/subgraph/provider.rs b/core/src/subgraph/provider.rs deleted file mode 100644 index 2ea4327838b..00000000000 --- a/core/src/subgraph/provider.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::sync::Mutex; -use std::{collections::HashSet, time::Instant}; - -use async_trait::async_trait; - -use graph::{ - components::store::{DeploymentId, DeploymentLocator}, - prelude::{SubgraphAssignmentProvider as SubgraphAssignmentProviderTrait, *}, -}; - -#[derive(Debug)] -struct DeploymentRegistry { - subgraphs_deployed: Arc>>, - subgraph_metrics: Arc, -} - -impl DeploymentRegistry { - fn new(subgraph_metrics: Arc) -> Self { - Self { - subgraphs_deployed: Arc::new(Mutex::new(HashSet::new())), - subgraph_metrics, - } - } - - fn insert(&self, id: DeploymentId) -> bool { - if !self.subgraphs_deployed.lock().unwrap().insert(id) { - return false; - } - - self.subgraph_metrics.deployment_count.inc(); - true - } - - fn remove(&self, id: &DeploymentId) -> bool { - if !self.subgraphs_deployed.lock().unwrap().remove(id) { - return false; - } - - self.subgraph_metrics.deployment_count.dec(); - true - } -} - -pub struct SubgraphAssignmentProvider { - logger_factory: LoggerFactory, - deployment_registry: DeploymentRegistry, - instance_manager: Arc, -} - -impl SubgraphAssignmentProvider { - pub fn new( - logger_factory: &LoggerFactory, - instance_manager: I, - subgraph_metrics: Arc, - ) -> Self { - let logger = logger_factory.component_logger("SubgraphAssignmentProvider", 
None); - let logger_factory = logger_factory.with_parent(logger.clone()); - - // Create the subgraph provider - SubgraphAssignmentProvider { - logger_factory, - instance_manager: Arc::new(instance_manager), - deployment_registry: DeploymentRegistry::new(subgraph_metrics), - } - } -} - -#[async_trait] -impl SubgraphAssignmentProviderTrait for SubgraphAssignmentProvider { - async fn start(&self, loc: DeploymentLocator, stop_block: Option) { - let logger = self.logger_factory.subgraph_logger(&loc); - - // If subgraph ID already in set - if !self.deployment_registry.insert(loc.id) { - info!(logger, "Subgraph deployment is already running"); - - return; - } - - let start_time = Instant::now(); - - self.instance_manager - .cheap_clone() - .start_subgraph(loc, stop_block) - .await; - - debug!( - logger, - "Subgraph started"; - "start_ms" => start_time.elapsed().as_millis() - ); - } - - async fn stop(&self, deployment: DeploymentLocator) { - // If subgraph ID was in set - if self.deployment_registry.remove(&deployment.id) { - // Shut down subgraph processing - self.instance_manager.stop_subgraph(deployment).await; - } - } -} diff --git a/core/src/subgraph/registrar.rs b/core/src/subgraph/registrar.rs index b05ccdf4e33..2f468edda30 100644 --- a/core/src/subgraph/registrar.rs +++ b/core/src/subgraph/registrar.rs @@ -14,10 +14,7 @@ use graph::futures03; use graph::futures03::future::TryFutureExt; use graph::futures03::Stream; use graph::futures03::StreamExt; -use graph::prelude::{ - CreateSubgraphResult, SubgraphAssignmentProvider as SubgraphAssignmentProviderTrait, - SubgraphRegistrar as SubgraphRegistrarTrait, *, -}; +use graph::prelude::{CreateSubgraphResult, SubgraphRegistrar as SubgraphRegistrarTrait, *}; use graph::tokio_retry::Retry; use graph::util::futures::retry_strategy; use graph::util::futures::RETRY_DEFAULT_LIMIT; @@ -38,7 +35,7 @@ pub struct SubgraphRegistrar { impl SubgraphRegistrar where - P: SubgraphAssignmentProviderTrait, + P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, { @@ -160,11 +157,14 @@ where // Start subgraph on this node debug!(logger, "Deployment assignee is this node"; "assigned_to" => assigned, "action" => "add"); - self.provider.start(deployment, None).await; + self.provider + .cheap_clone() + .start_subgraph(deployment, None) + .await; } else { // Ensure it is removed from this node debug!(logger, "Deployment assignee is not this node"; "assigned_to" => assigned, "action" => "remove"); - self.provider.stop(deployment).await + self.provider.stop_subgraph(deployment).await } } else { // Was added/updated, but is now gone. @@ -172,10 +172,7 @@ where } } AssignmentOperation::Removed => { - // Send remove event without checking node ID. - // If node ID does not match, then this is a no-op when handled in - // assignment provider. 
- self.provider.stop(deployment).await; + self.provider.stop_subgraph(deployment).await; } } } @@ -210,7 +207,7 @@ where let provider = self.provider.cheap_clone(); graph::spawn(async move { - provider.start(id, None).await; + provider.start_subgraph(id, None).await; drop(sender) }); } @@ -225,7 +222,7 @@ where #[async_trait] impl SubgraphRegistrarTrait for SubgraphRegistrar where - P: SubgraphAssignmentProviderTrait, + P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, { diff --git a/core/src/subgraph_provider.rs b/core/src/subgraph_provider.rs new file mode 100644 index 00000000000..c54583a938f --- /dev/null +++ b/core/src/subgraph_provider.rs @@ -0,0 +1,365 @@ +use std::{collections::HashMap, sync::Arc, time::Instant}; + +use graph::{ + cheap_clone::CheapClone as _, + components::{ + link_resolver::{LinkResolver, LinkResolverContext}, + metrics::subgraph::SubgraphCountMetric, + store::DeploymentLocator, + subgraph::SubgraphInstanceManager, + }, + log::factory::LoggerFactory, + nozzle, +}; +use itertools::Itertools as _; +use parking_lot::RwLock; +use slog::{debug, error}; +use tokio_util::sync::CancellationToken; + +/// Starts and stops subgraph deployments. +/// +/// For each subgraph deployment, checks the subgraph processing kind +/// and finds the appropriate subgraph instance manager to handle the +/// processing of the subgraph deployment. +/// +/// This is required to support both trigger-based subgraphs and Amp-powered subgraphs, +/// which have separate runners. +pub struct SubgraphProvider { + logger_factory: LoggerFactory, + count_metrics: Arc, + link_resolver: Arc, + + /// Stops active subgraph start request tasks. + /// + /// When a subgraph deployment start request is processed, a background task is created + /// to load the subgraph manifest and determine the subgraph processing kind. The processing + /// kind is then used to find the appropriate subgraph instance manager. This token stops + /// all tasks that are still loading manifests or waiting for subgraphs to start. + cancel_token: CancellationToken, + + /// Contains the enabled subgraph instance managers. + /// + /// Only subgraphs for which there is an appropriate instance manager will be started. + instance_managers: SubgraphInstanceManagers, + + /// Maintains a list of started subgraphs with their processing kinds. + /// + /// Used to forward subgraph deployment stop requests to the appropriate subgraph instance manager. + assignments: SubgraphAssignments, +} + +impl SubgraphProvider { + /// Creates a new subgraph provider. 
+ /// + /// # Arguments + /// - `logger_factory`: Creates loggers for each subgraph deployment start/stop request + /// - `count_metrics`: Tracks the number of started subgraph deployments + /// - `link_resolver`: Loads subgraph manifests to determine the subgraph processing kinds + /// - `cancel_token`: Stops active subgraph start request tasks + /// - `instance_managers`: Contains the enabled subgraph instance managers + pub fn new( + logger_factory: &LoggerFactory, + count_metrics: Arc, + link_resolver: Arc, + cancel_token: CancellationToken, + instance_managers: SubgraphInstanceManagers, + ) -> Self { + let logger = logger_factory.component_logger("SubgraphProvider", None); + let logger_factory = logger_factory.with_parent(logger.cheap_clone()); + + debug!(logger, "Creating subgraph provider"; + "enabled_subgraph_processing_kinds" => instance_managers.0.keys().join(", ") + ); + + Self { + logger_factory, + count_metrics, + link_resolver, + cancel_token, + instance_managers, + assignments: SubgraphAssignments::new(), + } + } + + /// Starts a subgraph deployment with the appropriate subgraph instance manager. + /// + /// Loads the subgraph manifest for the specified deployment locator, determines + /// the subgraph processing kind, finds the required instance manager, and forwards + /// the start request to that instance manager. Keeps the subgraph processing kind + /// in memory for handling the stop requests. + async fn assign_and_start_subgraph( + &self, + loc: DeploymentLocator, + stop_block: Option, + ) -> Result<(), Error> { + let logger = self.logger_factory.subgraph_logger(&loc); + + let link_resolver = self + .link_resolver + .for_manifest(&loc.hash.to_string()) + .map_err(|e| Error::CreateLinkResolver { + loc: loc.cheap_clone(), + source: e, + })?; + + let file_bytes = link_resolver + .cat( + &LinkResolverContext::new(&loc.hash, &logger), + &loc.hash.to_ipfs_link(), + ) + .await + .map_err(|e| Error::LoadManifest { + loc: loc.cheap_clone(), + source: e, + })?; + + let raw_manifest: serde_yaml::Mapping = + serde_yaml::from_slice(&file_bytes).map_err(|e| Error::ParseManifest { + loc: loc.cheap_clone(), + source: e, + })?; + + let subgraph_kind = SubgraphProcessingKind::from_manifest(&raw_manifest); + self.assignments.set_subgraph_kind(&loc, subgraph_kind); + + let Some(instance_manager) = self.instance_managers.get(&subgraph_kind) else { + return Err(Error::GetManager { loc, subgraph_kind }); + }; + + instance_manager.start_subgraph(loc, stop_block).await; + Ok(()) + } +} + +#[async_trait::async_trait] +impl SubgraphInstanceManager for SubgraphProvider { + async fn start_subgraph(self: Arc, loc: DeploymentLocator, stop_block: Option) { + let logger = self + .logger_factory + .subgraph_logger(&loc) + .new(slog::o!("method" => "start_subgraph")); + + if self.assignments.is_assigned(&loc) { + debug!(logger, "Subgraph is already started"); + return; + } + + self.count_metrics.deployment_count.inc(); + + let handle = tokio::spawn({ + let provider = self.cheap_clone(); + let loc = loc.cheap_clone(); + let start_instant = Instant::now(); + + async move { + debug!(logger, "Starting subgraph"); + + let fut = provider.assign_and_start_subgraph(loc, stop_block); + match provider.cancel_token.run_until_cancelled(fut).await { + Some(Ok(())) => { + debug!(logger, "Subgraph started"; + "duration_ms" => start_instant.elapsed().as_millis() + ); + } + Some(Err(e)) => { + error!(logger, "Subgraph failed to start"; + "e" => ?e + ); + } + None => { + debug!(logger, "Subgraph start cancelled"); + } + 
} + } + }); + + self.assignments.add( + loc, + SubgraphAssignment { + handle, + subgraph_kind: None, + }, + ) + } + + async fn stop_subgraph(&self, loc: DeploymentLocator) { + let logger = self + .logger_factory + .subgraph_logger(&loc) + .new(slog::o!("method" => "stop_subgraph")); + + debug!(logger, "Stopping subgraph"); + + let Some(SubgraphAssignment { + handle, + subgraph_kind, + }) = self.assignments.take(&loc) + else { + debug!(logger, "Subgraph is not started"); + return; + }; + + handle.abort(); + self.count_metrics.deployment_count.dec(); + + let Some(subgraph_kind) = subgraph_kind else { + debug!(logger, "Unknown subgraph kind"); + return; + }; + + let Some(instance_manager) = self.instance_managers.get(&subgraph_kind) else { + debug!(logger, "Missing instance manager"); + return; + }; + + instance_manager.stop_subgraph(loc).await; + debug!(logger, "Subgraph stopped"); + } +} + +/// Enumerates all possible errors of the subgraph provider. +#[derive(Debug, thiserror::Error)] +enum Error { + #[error("failed to create link resolver for '{loc}': {source:#}")] + CreateLinkResolver { + loc: DeploymentLocator, + source: anyhow::Error, + }, + + #[error("failed to load manifest for '{loc}': {source:#}")] + LoadManifest { + loc: DeploymentLocator, + source: anyhow::Error, + }, + + #[error("failed to parse manifest for '{loc}': {source:#}")] + ParseManifest { + loc: DeploymentLocator, + source: serde_yaml::Error, + }, + + #[error("failed to get instance manager for '{loc}' with kind '{subgraph_kind}'")] + GetManager { + loc: DeploymentLocator, + subgraph_kind: SubgraphProcessingKind, + }, +} + +/// Contains a mapping of enabled subgraph instance managers by subgraph processing kinds. +/// +/// Before starting a subgraph, its processing kind is determined from the subgraph manifest. +/// Then, the appropriate instance manager is loaded from this mapping. +pub struct SubgraphInstanceManagers( + HashMap>, +); + +impl SubgraphInstanceManagers { + /// Creates a new empty subgraph instance manager mapping. + pub fn new() -> Self { + Self(HashMap::new()) + } + + /// Adds a new subgraph instance manager for all subgraphs of the specified processing kind. + pub fn add( + &mut self, + subgraph_kind: SubgraphProcessingKind, + instance_manager: Arc, + ) { + self.0.insert(subgraph_kind, instance_manager); + } + + /// Returns the subgraph instance manager for the specified processing kind. + pub fn get( + &self, + subgraph_kind: &SubgraphProcessingKind, + ) -> Option> { + self.0 + .get(subgraph_kind) + .map(|instance_manager| instance_manager.cheap_clone()) + } +} + +/// Enumerates the supported subgraph processing kinds. +/// +/// Subgraphs may have different processing requirements, and this enum helps to map them +/// to the appropriate instance managers. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display)] +#[strum(serialize_all = "snake_case")] +pub enum SubgraphProcessingKind { + /// Represents trigger-based subgraphs. + Trigger, + + /// Represents Amp-powered subgraphs. + Amp, +} + +impl SubgraphProcessingKind { + /// Determines the subgraph processing kind from the subgraph manifest. 
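A sketch of the classification rule implemented below (illustrative; the YAML snippet is a minimal stand-in for a real manifest):

    // One data source with `kind: nozzle` classifies the manifest as Amp;
    // everything else falls back to Trigger.
    let raw: serde_yaml::Mapping =
        serde_yaml::from_str("dataSources:\n  - kind: nozzle\n    name: transfers\n").unwrap();

    assert_eq!(
        SubgraphProcessingKind::from_manifest(&raw),
        SubgraphProcessingKind::Amp
    );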
+ fn from_manifest(raw_manifest: &serde_yaml::Mapping) -> Self { + use serde_yaml::Value; + + let is_amp_manifest = raw_manifest + .get("dataSources") + .and_then(Value::as_sequence) + .and_then(|seq| { + seq.iter() + .filter_map(Value::as_mapping) + .filter_map(|map| map.get("kind")) + .filter_map(Value::as_str) + .filter(|kind| *kind == nozzle::manifest::DataSource::KIND) + .next() + }) + .is_some(); + + if is_amp_manifest { + return Self::Amp; + } + + Self::Trigger + } +} + +/// Maintains a list of started subgraph deployments with details required for stopping them. +struct SubgraphAssignments(RwLock>); + +impl SubgraphAssignments { + /// Creates a new empty list of started subgraph deployments. + fn new() -> Self { + Self(RwLock::new(HashMap::new())) + } + + /// Adds a new subgraph deployment to the list of started subgraph deployments. + fn add(&self, loc: DeploymentLocator, subgraph_assignment: SubgraphAssignment) { + self.0.write().insert(loc, subgraph_assignment); + } + + /// Updates the started subgraph deployment with the specified subgraph processing kind. + fn set_subgraph_kind(&self, loc: &DeploymentLocator, subgraph_kind: SubgraphProcessingKind) { + if let Some(subgraph_assignment) = self.0.write().get_mut(loc) { + subgraph_assignment.subgraph_kind = Some(subgraph_kind); + } + } + + /// Checks if the subgraph deployment is started. + fn is_assigned(&self, loc: &DeploymentLocator) -> bool { + self.0.read().contains_key(loc) + } + + /// Removes the subgraph deployment from the list of started subgraph deployments and returns its details. + fn take(&self, loc: &DeploymentLocator) -> Option { + self.0.write().remove(loc) + } +} + +/// Contains the details of a started subgraph deployment. +struct SubgraphAssignment { + /// The handle to the background task that starts this subgraph deployment. + handle: tokio::task::JoinHandle<()>, + + /// The subgraph processing kind of this subgraph deployment. + /// + /// Used to get the appropriate subgraph instance manager to forward the stop request to. + /// + /// Set to `None` until the subgraph manifest is loaded and parsed. + subgraph_kind: Option, +} diff --git a/graph/src/components/subgraph/instance_manager.rs b/graph/src/components/subgraph/instance_manager.rs index 7caf64626ba..d014366ead2 100644 --- a/graph/src/components/subgraph/instance_manager.rs +++ b/graph/src/components/subgraph/instance_manager.rs @@ -1,4 +1,3 @@ -use crate::prelude::BlockNumber; use std::sync::Arc; use crate::components::store::DeploymentLocator; @@ -10,17 +9,7 @@ use crate::components::store::DeploymentLocator; /// subgraph instance manager stops and removes the corresponding instance. #[async_trait::async_trait] pub trait SubgraphInstanceManager: Send + Sync + 'static { - /// Returns `true` if this manager has the necessary capabilities to manage the subgraph. 
- fn can_manage( - &self, - deployment: &DeploymentLocator, - raw_manifest: &serde_yaml::Mapping, - ) -> bool; + async fn start_subgraph(self: Arc, loc: DeploymentLocator, stop_block: Option); - async fn start_subgraph( - self: Arc, - deployment: DeploymentLocator, - stop_block: Option, - ); - async fn stop_subgraph(&self, deployment: DeploymentLocator); + async fn stop_subgraph(&self, loc: DeploymentLocator); } diff --git a/graph/src/components/subgraph/mod.rs b/graph/src/components/subgraph/mod.rs index 5bdea73ca45..02b6486b953 100644 --- a/graph/src/components/subgraph/mod.rs +++ b/graph/src/components/subgraph/mod.rs @@ -2,7 +2,6 @@ mod host; mod instance; mod instance_manager; mod proof_of_indexing; -mod provider; mod registrar; mod settings; @@ -15,6 +14,5 @@ pub use self::proof_of_indexing::{ PoICausalityRegion, ProofOfIndexing, ProofOfIndexingEvent, ProofOfIndexingFinisher, ProofOfIndexingVersion, SharedProofOfIndexing, }; -pub use self::provider::SubgraphAssignmentProvider; pub use self::registrar::{SubgraphRegistrar, SubgraphVersionSwitchingMode}; pub use self::settings::{Setting, Settings}; diff --git a/graph/src/components/subgraph/provider.rs b/graph/src/components/subgraph/provider.rs deleted file mode 100644 index 3e33f6fd5bf..00000000000 --- a/graph/src/components/subgraph/provider.rs +++ /dev/null @@ -1,10 +0,0 @@ -use async_trait::async_trait; - -use crate::{components::store::DeploymentLocator, prelude::*}; - -/// Common trait for subgraph providers. -#[async_trait] -pub trait SubgraphAssignmentProvider: Send + Sync + 'static { - async fn start(&self, deployment: DeploymentLocator, stop_block: Option); - async fn stop(&self, deployment: DeploymentLocator); -} diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 03ee57b8e13..7095cede7a1 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -138,8 +138,7 @@ pub mod prelude { }; pub use crate::components::subgraph::{ BlockState, HostMetrics, InstanceDSTemplateInfo, RuntimeHost, RuntimeHostBuilder, - SubgraphAssignmentProvider, SubgraphInstanceManager, SubgraphRegistrar, - SubgraphVersionSwitchingMode, + SubgraphInstanceManager, SubgraphRegistrar, SubgraphVersionSwitchingMode, }; pub use crate::components::trigger_processor::TriggerProcessor; pub use crate::components::versions::{ApiVersion, FeatureFlag}; diff --git a/graph/src/nozzle/manifest/data_source/raw.rs b/graph/src/nozzle/manifest/data_source/raw.rs index a36994108e5..24a77942812 100644 --- a/graph/src/nozzle/manifest/data_source/raw.rs +++ b/graph/src/nozzle/manifest/data_source/raw.rs @@ -14,7 +14,8 @@ use thiserror::Error; use super::{Abi, DataSource, Source, Table, Transformer}; use crate::{ - components::link_resolver::LinkResolver, + components::link_resolver::{LinkResolver, LinkResolverContext}, + data::subgraph::DeploymentHash, nozzle::{ self, common::{column_aliases, Ident}, @@ -336,7 +337,10 @@ impl RawAbi { } let file_bytes = link_resolver - .cat(logger, &(file.into())) + .cat( + &LinkResolverContext::new(&DeploymentHash::default(), &logger), + &(file.into()), + ) .await .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; @@ -440,7 +444,10 @@ impl RawTable { } let file_bytes = link_resolver - .cat(logger, &(file.into())) + .cat( + &LinkResolverContext::new(&DeploymentHash::default(), logger), + &(file.into()), + ) .await .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; diff --git a/node/Cargo.toml b/node/Cargo.toml index 5b7f051efe1..c94bed08b5d 100644 --- a/node/Cargo.toml +++ b/node/Cargo.toml @@ -41,3 
+41,6 @@ prometheus = { version = "0.14.0", features = ["push"] } json-structural-diff = { version = "0.2", features = ["colorize"] } globset = "0.4.16" notify = "8.2.0" + +# Dependencies related to Amp subgraphs +tokio-util.workspace = true diff --git a/node/src/launcher.rs b/node/src/launcher.rs index 8855ef1a954..410ce38293a 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -1,27 +1,22 @@ -use anyhow::Result; +use std::{ + io::{BufRead, BufReader}, + path::Path, + time::Duration, +}; +use anyhow::Result; use git_testament::{git_testament, render_testament}; -use graph::futures03::future::TryFutureExt; - -use crate::config::Config; -use crate::helpers::watch_subgraph_updates; -use crate::network_setup::Networks; -use crate::opt::Opt; -use crate::store_builder::StoreBuilder; use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; use graph::components::link_resolver::{ArweaveClient, FileSizeLimit}; use graph::components::subgraph::Settings; use graph::data::graphql::load_manager::LoadManager; use graph::endpoint::EndpointMetrics; use graph::env::EnvVars; +use graph::futures03::future::TryFutureExt; use graph::prelude::*; use graph::prometheus::Registry; use graph::url::Url; use graph_core::polling_monitor::{arweave_service, ArweaveService, IpfsService}; -use graph_core::{ - SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager, - SubgraphRegistrar as IpfsSubgraphRegistrar, -}; use graph_graphql::prelude::GraphQlRunner; use graph_server_http::GraphQLServer as GraphQLQueryServer; use graph_server_index_node::IndexNodeServer; @@ -33,11 +28,14 @@ use graph_store_postgres::{ }; use graphman_server::GraphmanServer; use graphman_server::GraphmanServerConfig; -use std::io::{BufRead, BufReader}; -use std::path::Path; -use std::time::Duration; use tokio::sync::mpsc; +use crate::config::Config; +use crate::helpers::watch_subgraph_updates; +use crate::network_setup::Networks; +use crate::opt::Opt; +use crate::store_builder::StoreBuilder; + git_testament!(TESTAMENT); /// Sets up metrics and monitoring @@ -269,8 +267,8 @@ fn build_subgraph_registrar( arweave_service: ArweaveService, ipfs_service: IpfsService, ) -> Arc< - IpfsSubgraphRegistrar< - IpfsSubgraphAssignmentProvider>, + graph_core::subgraph::SubgraphRegistrar< + graph_core::subgraph_provider::SubgraphProvider, SubgraphStore, SubscriptionManager, >, @@ -278,7 +276,7 @@ fn build_subgraph_registrar( let static_filters = ENV_VARS.experimental_static_filters; let sg_count = Arc::new(SubgraphCountMetric::new(metrics_registry.cheap_clone())); - let subgraph_instance_manager = SubgraphInstanceManager::new( + let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new( &logger_factory, env_vars.cheap_clone(), network_store.subgraph_store(), @@ -291,15 +289,27 @@ fn build_subgraph_registrar( static_filters, ); - // Create IPFS-based subgraph provider - let subgraph_provider = - IpfsSubgraphAssignmentProvider::new(&logger_factory, subgraph_instance_manager, sg_count); + let mut subgraph_instance_managers = + graph_core::subgraph_provider::SubgraphInstanceManagers::new(); + + subgraph_instance_managers.add( + graph_core::subgraph_provider::SubgraphProcessingKind::Trigger, + Arc::new(subgraph_instance_manager), + ); + + let subgraph_provider = graph_core::subgraph_provider::SubgraphProvider::new( + &logger_factory, + sg_count.cheap_clone(), + link_resolver.cheap_clone(), + tokio_util::sync::CancellationToken::new(), + subgraph_instance_managers, + ); // Check version 
switching mode environment variable let version_switching_mode = ENV_VARS.subgraph_version_switching_mode; // Create named subgraph provider for resolving subgraph name->ID mappings - let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new( + let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new( &logger_factory, link_resolver, Arc::new(subgraph_provider), diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs index 060341fb6e0..bf4ade053e5 100644 --- a/node/src/manager/commands/run.rs +++ b/node/src/manager/commands/run.rs @@ -12,20 +12,16 @@ use graph::cheap_clone::CheapClone; use graph::components::link_resolver::{ArweaveClient, FileSizeLimit}; use graph::components::network_provider::chain_id_validator; use graph::components::store::DeploymentLocator; -use graph::components::subgraph::Settings; +use graph::components::subgraph::{Settings, SubgraphInstanceManager as _}; use graph::endpoint::EndpointMetrics; use graph::env::EnvVars; use graph::prelude::{ anyhow, tokio, BlockNumber, DeploymentHash, IpfsResolver, LoggerFactory, NodeId, - SubgraphAssignmentProvider, SubgraphCountMetric, SubgraphName, SubgraphRegistrar, - SubgraphStore, SubgraphVersionSwitchingMode, ENV_VARS, + SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore, + SubgraphVersionSwitchingMode, ENV_VARS, }; use graph::slog::{debug, info, Logger}; use graph_core::polling_monitor::{arweave_service, ipfs_service}; -use graph_core::{ - SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager, - SubgraphRegistrar as IpfsSubgraphRegistrar, -}; fn locate(store: &dyn SubgraphStore, hash: &str) -> Result { let mut locators = store.locators(hash)?; @@ -139,10 +135,8 @@ pub async fn run( ); let static_filters = ENV_VARS.experimental_static_filters; - let sg_metrics = Arc::new(SubgraphCountMetric::new(metrics_registry.clone())); - - let subgraph_instance_manager = SubgraphInstanceManager::new( + let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new( &logger_factory, env_vars.cheap_clone(), subgraph_store.clone(), @@ -155,19 +149,28 @@ pub async fn run( static_filters, ); - // Create IPFS-based subgraph provider - let subgraph_provider = Arc::new(IpfsSubgraphAssignmentProvider::new( + let mut subgraph_instance_managers = + graph_core::subgraph_provider::SubgraphInstanceManagers::new(); + + subgraph_instance_managers.add( + graph_core::subgraph_provider::SubgraphProcessingKind::Trigger, + Arc::new(subgraph_instance_manager), + ); + + let subgraph_provider = Arc::new(graph_core::subgraph_provider::SubgraphProvider::new( &logger_factory, - subgraph_instance_manager, - sg_metrics, + sg_metrics.cheap_clone(), + link_resolver.cheap_clone(), + tokio_util::sync::CancellationToken::new(), + subgraph_instance_managers, )); let panicking_subscription_manager = Arc::new(PanicSubscriptionManager {}); - let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new( + let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new( &logger_factory, link_resolver.cheap_clone(), - subgraph_provider.clone(), + subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, blockchain_map, @@ -216,7 +219,9 @@ pub async fn run( let locator = locate(subgraph_store.as_ref(), &hash)?; - SubgraphAssignmentProvider::start(subgraph_provider.as_ref(), locator, Some(stop_block)).await; + subgraph_provider + .start_subgraph(locator, Some(stop_block)) + .await; loop { 
tokio::time::sleep(Duration::from_millis(1000)).await; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 3d6a3771a93..268f7c3c6cd 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -24,6 +24,7 @@ tokio = { version = "1.45.1", features = ["rt", "macros", "process"] } # here needs to be kept in sync with the web3 version that the graph crate # uses until then secp256k1 = { version = "0.21", features = ["recovery"] } +tokio-util.workspace = true [dev-dependencies] anyhow = "1.0.100" diff --git a/tests/src/fixture/mod.rs b/tests/src/fixture/mod.rs index 362cef37f44..0deb580d3b2 100644 --- a/tests/src/fixture/mod.rs +++ b/tests/src/fixture/mod.rs @@ -24,7 +24,7 @@ use graph::components::link_resolver::{ use graph::components::metrics::MetricsRegistry; use graph::components::network_provider::ChainName; use graph::components::store::{DeploymentLocator, EthereumCallCache, SourceableStore}; -use graph::components::subgraph::Settings; +use graph::components::subgraph::{Settings, SubgraphInstanceManager as _}; use graph::data::graphql::load_manager::LoadManager; use graph::data::query::{Query, QueryTarget}; use graph::data::subgraph::schema::{SubgraphError, SubgraphHealth}; @@ -42,18 +42,14 @@ use graph::prelude::serde_json::{self, json}; use graph::prelude::{ async_trait, lazy_static, q, r, ApiVersion, BigInt, BlockNumber, DeploymentHash, GraphQlRunner as _, IpfsResolver, LinkResolver, LoggerFactory, NodeId, QueryError, - SubgraphAssignmentProvider, SubgraphCountMetric, SubgraphName, SubgraphRegistrar, - SubgraphStore as _, SubgraphVersionSwitchingMode, TriggerProcessor, + SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore as _, + SubgraphVersionSwitchingMode, TriggerProcessor, }; use graph::schema::InputSchema; use graph_chain_ethereum::chain::RuntimeAdapterBuilder; use graph_chain_ethereum::network::EthereumNetworkAdapters; use graph_chain_ethereum::Chain; use graph_core::polling_monitor::{arweave_service, ipfs_service}; -use graph_core::{ - SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager, - SubgraphRegistrar as IpfsSubgraphRegistrar, SubgraphTriggerProcessor, -}; use graph_node::manager::PanicSubscriptionManager; use graph_node::{config::Config, store_builder::StoreBuilder}; use graph_runtime_wasm::RuntimeHostBuilder; @@ -158,15 +154,12 @@ pub trait TestChainTrait { pub struct TestContext { pub logger: Logger, - pub provider: Arc< - IpfsSubgraphAssignmentProvider< - SubgraphInstanceManager, - >, - >, + pub provider: Arc, pub store: Arc, pub deployment: DeploymentLocator, pub subgraph_name: SubgraphName, - pub instance_manager: SubgraphInstanceManager, + pub instance_manager: + Arc>, pub link_resolver: Arc, pub arweave_resolver: Arc, pub env_vars: Arc, @@ -204,12 +197,13 @@ impl TestContext { pub async fn runner( &self, stop_block: BlockPtr, - ) -> graph_core::SubgraphRunner< + ) -> graph_core::subgraph::SubgraphRunner< graph_chain_ethereum::Chain, RuntimeHostBuilder, > { let (logger, deployment, raw) = self.get_runner_context().await; - let tp: Box> = Box::new(SubgraphTriggerProcessor {}); + let tp: Box> = + Box::new(graph_core::subgraph::SubgraphTriggerProcessor {}); let deployment_status_metric = self .instance_manager @@ -233,7 +227,7 @@ impl TestContext { pub async fn runner_substreams( &self, stop_block: BlockPtr, - ) -> graph_core::SubgraphRunner< + ) -> graph_core::subgraph::SubgraphRunner< graph_chain_substreams::Chain, RuntimeHostBuilder, > { @@ -282,10 +276,13 @@ impl TestContext { pub async fn start_and_sync_to(&self, 
stop_block: BlockPtr) { // In case the subgraph has been previously started. - self.provider.stop(self.deployment.clone()).await; + self.provider + .stop_subgraph(self.deployment.cheap_clone()) + .await; self.provider - .start(self.deployment.clone(), Some(stop_block.number)) + .cheap_clone() + .start_subgraph(self.deployment.cheap_clone(), Some(stop_block.number)) .await; debug!(self.logger, "TEST: syncing to {}", stop_block.number); @@ -302,9 +299,14 @@ impl TestContext { pub async fn start_and_sync_to_error(&self, stop_block: BlockPtr) -> SubgraphError { // In case the subgraph has been previously started. - self.provider.stop(self.deployment.clone()).await; + self.provider + .stop_subgraph(self.deployment.cheap_clone()) + .await; - self.provider.start(self.deployment.clone(), None).await; + self.provider + .cheap_clone() + .start_subgraph(self.deployment.cheap_clone(), None) + .await; wait_for_sync( &self.logger, @@ -542,7 +544,8 @@ pub async fn setup_inner( let sg_count = Arc::new(SubgraphCountMetric::new(mock_registry.cheap_clone())); let blockchain_map = Arc::new(blockchain_map); - let subgraph_instance_manager = SubgraphInstanceManager::new( + + let subgraph_instance_manager = Arc::new(graph_core::subgraph::SubgraphInstanceManager::new( &logger_factory, env_vars.cheap_clone(), subgraph_store.clone(), @@ -553,8 +556,24 @@ pub async fn setup_inner( ipfs_service, arweave_service, static_filters, + )); + + let mut subgraph_instance_managers = + graph_core::subgraph_provider::SubgraphInstanceManagers::new(); + + subgraph_instance_managers.add( + graph_core::subgraph_provider::SubgraphProcessingKind::Trigger, + subgraph_instance_manager.cheap_clone(), ); + let subgraph_provider = Arc::new(graph_core::subgraph_provider::SubgraphProvider::new( + &logger_factory, + sg_count.cheap_clone(), + link_resolver.cheap_clone(), + tokio_util::sync::CancellationToken::new(), + subgraph_instance_managers, + )); + // Graphql runner let load_manager = LoadManager::new(&logger, Vec::new(), Vec::new(), mock_registry.clone()); let graphql_runner = Arc::new(GraphQlRunner::new( @@ -571,19 +590,12 @@ pub async fn setup_inner( link_resolver.cheap_clone(), )); - // Create IPFS-based subgraph provider - let subgraph_provider = Arc::new(IpfsSubgraphAssignmentProvider::new( - &logger_factory, - subgraph_instance_manager.clone(), - sg_count, - )); - let panicking_subscription_manager = Arc::new(PanicSubscriptionManager {}); - let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new( + let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new( &logger_factory, link_resolver.cheap_clone(), - subgraph_provider.clone(), + subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, blockchain_map.clone(), diff --git a/tests/tests/runner_tests.rs b/tests/tests/runner_tests.rs index cd2c059e2dc..99dacf63b84 100644 --- a/tests/tests/runner_tests.rs +++ b/tests/tests/runner_tests.rs @@ -7,6 +7,7 @@ use std::time::Duration; use assert_json_diff::assert_json_eq; use graph::blockchain::block_stream::BlockWithTriggers; use graph::blockchain::{Block, BlockPtr, Blockchain}; +use graph::components::subgraph::SubgraphInstanceManager as _; use graph::data::store::scalar::Bytes; use graph::data::subgraph::schema::{SubgraphError, SubgraphHealth}; use graph::data::value::Word; @@ -16,7 +17,7 @@ use graph::ipfs::test_utils::add_files_to_local_ipfs_node_for_testing; use graph::object; use graph::prelude::ethabi::ethereum_types::H256; use graph::prelude::web3::types::Address; -use 
graph::prelude::{hex, CheapClone, SubgraphAssignmentProvider, SubgraphName, SubgraphStore}; +use graph::prelude::{hex, CheapClone, SubgraphName, SubgraphStore}; use graph_tests::fixture::ethereum::{ chain, empty_block, generate_empty_blocks_for_range, genesis, push_test_command, push_test_log, push_test_polling_trigger, @@ -82,7 +83,10 @@ async fn data_source_revert() -> anyhow::Result<()> { let stop_block = test_ptr(2); base_ctx.start_and_sync_to(stop_block).await; - base_ctx.provider.stop(base_ctx.deployment.clone()).await; + base_ctx + .provider + .stop_subgraph(base_ctx.deployment.clone()) + .await; // Test loading data sources from DB. let stop_block = test_ptr(3); From efb7c5972a50b6749ffac939f81f5b909f5dd3fc Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 11/40] fix(graph): update deterministic error patterns in Nozzle Flight client --- graph/src/nozzle/client/flight_client.rs | 19 +++++++++---------- graph/src/nozzle/client/mod.rs | 4 +++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/graph/src/nozzle/client/flight_client.rs b/graph/src/nozzle/client/flight_client.rs index 19d4981388d..f0f1d800196 100644 --- a/graph/src/nozzle/client/flight_client.rs +++ b/graph/src/nozzle/client/flight_client.rs @@ -16,7 +16,6 @@ use async_stream::try_stream; use bytes::Bytes; use futures03::{future::BoxFuture, stream::BoxStream, StreamExt}; use http::Uri; -use lazy_regex::regex_is_match; use serde::{Deserialize, Serialize}; use slog::{debug, trace, Logger}; use thiserror::Error; @@ -191,7 +190,7 @@ impl Client for FlightClient { yield ResponseBatch::Reorg(reorg); } - yield ResponseBatch::Batch { data: record_batch}; + yield ResponseBatch::Batch { data: record_batch }; batch_index += 1; prev_block_ranges = block_ranges; @@ -231,9 +230,13 @@ impl error::IsDeterministic for Error { }; static DETERMINISTIC_ERROR_PATTERNS: &[&str] = &[ - r#", message: "SQL parse error:"#, - r#", message: "error looking up datasets:"#, - r#", message: "planning error:"#, + // Example SQL query: SELECT; + r#"code: InvalidArgument, message: ""#, + // Example SQL query: SELECT * FROM invalid_dataset; + // SELECT * FROM valid_dataset.invalid_table; + r#"code: Internal, message: "error creating planning context: "#, + // Example SQL query: SELECT invalid_column FROM valid_dataset.valid_table; + r#"code: Internal, message: "planning error: "#, ]; for &pattern in DETERMINISTIC_ERROR_PATTERNS { @@ -242,10 +245,6 @@ impl error::IsDeterministic for Error { } } - if regex_is_match!(r#", message: "dataset '.*?' not found, full error:"#, &msg) { - return true; - } - false } } @@ -363,7 +362,7 @@ fn detect_reorg( { return Some(LatestBlockBeforeReorg { network: block_range.network.clone(), - block_number: block_range.start().saturating_sub(1), + block_number: block_range.start().checked_sub(1), block_hash: block_range.prev_hash, }); } diff --git a/graph/src/nozzle/client/mod.rs b/graph/src/nozzle/client/mod.rs index f4c2fddffd5..0832ccf8864 100644 --- a/graph/src/nozzle/client/mod.rs +++ b/graph/src/nozzle/client/mod.rs @@ -72,7 +72,9 @@ pub struct LatestBlockBeforeReorg { pub network: String, /// Block number of the parent block of the first block after the reorg. - pub block_number: BlockNumber, + /// + /// It is `None` when the reorg affects every block in the blockchain. + pub block_number: Option, /// Block hash of the parent block of the first block after the reorg. 
/// From 5a2c3afd83bc666ef29ee9d886fe726cd5f02ef2 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 12/40] feat(graph): add Nozzle related ENV variables --- graph/src/env/mod.rs | 17 ++++++++++- graph/src/env/nozzle.rs | 65 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 graph/src/env/nozzle.rs diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs index 3fce087986e..4ba26444d51 100644 --- a/graph/src/env/mod.rs +++ b/graph/src/env/mod.rs @@ -1,11 +1,13 @@ mod graphql; mod mappings; +mod nozzle; mod store; +use std::{collections::HashSet, env::VarError, fmt, str::FromStr, sync::Arc, time::Duration}; + use envconfig::Envconfig; use lazy_static::lazy_static; use semver::Version; -use std::{collections::HashSet, env::VarError, fmt, str::FromStr, time::Duration}; use self::graphql::*; use self::mappings::*; @@ -15,6 +17,8 @@ use crate::{ runtime::gas::CONST_MAX_GAS_PER_HANDLER, }; +pub use self::nozzle::NozzleEnv; + #[cfg(debug_assertions)] use std::sync::Mutex; @@ -50,6 +54,7 @@ pub struct EnvVars { pub graphql: EnvVarsGraphQl, pub mappings: EnvVarsMapping, pub store: EnvVarsStore, + pub nozzle: Arc, /// Enables query throttling when getting database connections goes over this value. /// Load management can be disabled by setting this to 0. @@ -296,6 +301,7 @@ impl EnvVars { graphql, mappings: mapping_handlers, store, + nozzle: Arc::new(NozzleEnv::new(&inner)), load_threshold: Duration::from_millis(inner.load_threshold_in_ms), load_jail_threshold: inner.load_jail_threshold, @@ -587,6 +593,15 @@ struct Inner { default = "false" )] disable_deployment_hash_validation: EnvVarBoolean, + + #[envconfig(from = "GRAPH_NOZZLE_MAX_BUFFER_SIZE")] + nozzle_max_buffer_size: Option, + #[envconfig(from = "GRAPH_NOZZLE_MAX_BLOCK_RANGE")] + nozzle_max_block_range: Option, + #[envconfig(from = "GRAPH_NOZZLE_QUERY_RETRY_MIN_DELAY_SECONDS")] + nozzle_query_retry_min_delay_seconds: Option, + #[envconfig(from = "GRAPH_NOZZLE_QUERY_RETRY_MAX_DELAY_SECONDS")] + nozzle_query_retry_max_delay_seconds: Option, } #[derive(Clone, Debug)] diff --git a/graph/src/env/nozzle.rs b/graph/src/env/nozzle.rs new file mode 100644 index 00000000000..a64a691d447 --- /dev/null +++ b/graph/src/env/nozzle.rs @@ -0,0 +1,65 @@ +use std::time::Duration; + +/// Contains environment variables related to Nozzle subgraphs. +#[derive(Debug)] +pub struct NozzleEnv { + /// Maximum number of record batches to buffer in memory per stream for each SQL query. + /// This is the maximum number of record batches that can be output by a single block. + /// + /// Defaults to `1,000`. + pub max_buffer_size: usize, + + /// Maximum number of blocks to request per stream for each SQL query. + /// Limiting this value reduces load on the Nozzle server when processing heavy queries. + /// + /// Defaults to `2,000,000`. + pub max_block_range: usize, + + /// Minimum time to wait before retrying a failed SQL query to the Nozzle server. + /// + /// Defaults to `1` second. + pub query_retry_min_delay: Duration, + + /// Maximum time to wait before retrying a failed SQL query to the Nozzle server. + /// + /// Defaults to `600` seconds. 
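+    ///
+    /// Read from `GRAPH_NOZZLE_QUERY_RETRY_MAX_DELAY_SECONDS` as whole seconds,
+    /// e.g. `GRAPH_NOZZLE_QUERY_RETRY_MAX_DELAY_SECONDS=600` (illustrative value).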
+ pub query_retry_max_delay: Duration, +} + +impl NozzleEnv { + const DEFAULT_MAX_BUFFER_SIZE: usize = 1_000; + const DEFAULT_MAX_BLOCK_RANGE: usize = 2_000_000; + const DEFAULT_QUERY_RETRY_MIN_DELAY: Duration = Duration::from_secs(1); + const DEFAULT_QUERY_RETRY_MAX_DELAY: Duration = Duration::from_secs(600); + + pub(super) fn new(raw_env: &super::Inner) -> Self { + Self { + max_buffer_size: raw_env + .nozzle_max_buffer_size + .and_then(|value| { + if value == 0 { + return None; + } + Some(value) + }) + .unwrap_or(Self::DEFAULT_MAX_BUFFER_SIZE), + max_block_range: raw_env + .nozzle_max_block_range + .and_then(|mut value| { + if value == 0 { + value = usize::MAX; + } + Some(value) + }) + .unwrap_or(Self::DEFAULT_MAX_BLOCK_RANGE), + query_retry_min_delay: raw_env + .nozzle_query_retry_min_delay_seconds + .map(Duration::from_secs) + .unwrap_or(Self::DEFAULT_QUERY_RETRY_MIN_DELAY), + query_retry_max_delay: raw_env + .nozzle_query_retry_max_delay_seconds + .map(Duration::from_secs) + .unwrap_or(Self::DEFAULT_QUERY_RETRY_MAX_DELAY), + } + } +} From e5b7898eb99437a37990cd918fcc6fa77d3150bb Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 13/40] fix(graph): make block range filter return a new query --- graph/src/nozzle/sql/query/filter_blocks.rs | 21 +++++++++++---------- graph/src/nozzle/sql/query/mod.rs | 20 ++++++++------------ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/graph/src/nozzle/sql/query/filter_blocks.rs b/graph/src/nozzle/sql/query/filter_blocks.rs index 78e29eaca58..de1eec59edd 100644 --- a/graph/src/nozzle/sql/query/filter_blocks.rs +++ b/graph/src/nozzle/sql/query/filter_blocks.rs @@ -1,4 +1,7 @@ -use std::{collections::BTreeMap, ops::ControlFlow}; +use std::{ + collections::BTreeMap, + ops::{ControlFlow, RangeInclusive}, +}; use alloy::primitives::BlockNumber; use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; @@ -19,8 +22,7 @@ pub(super) fn filter_blocks( query: &mut ast::Query, dataset: &Ident, tables: &[Ident], - start_block: BlockNumber, - end_block: BlockNumber, + block_range: &RangeInclusive, ) { let tables_to_cte_mapping = tables_to_cte_mapping(dataset, tables); @@ -35,8 +37,7 @@ pub(super) fn filter_blocks( &mut with.cte_tables, dataset, &tables_to_cte_mapping, - start_block, - end_block, + block_range, ); } None => { @@ -46,8 +47,7 @@ pub(super) fn filter_blocks( &mut cte_tables, dataset, &tables_to_cte_mapping, - start_block, - end_block, + block_range, ); query.with = Some(ast::With { @@ -81,14 +81,15 @@ fn add_cte_filters( ctes: &mut Vec, dataset: &Ident, tables_to_cte_mapping: &BTreeMap, - start_block: BlockNumber, - end_block: BlockNumber, + block_range: &RangeInclusive, ) { let mut output_ctes = Vec::with_capacity(ctes.len() + tables_to_cte_mapping.len()); for (table, cte_table) in tables_to_cte_mapping { let query = parse::query(format!( - "SELECT * FROM {dataset}.{table} WHERE _block_num BETWEEN {start_block} AND {end_block} ORDER BY _block_num ASC" + "SELECT * FROM {dataset}.{table} WHERE _block_num BETWEEN {} AND {} ORDER BY _block_num ASC", + block_range.start(), + block_range.end() )) .unwrap(); diff --git a/graph/src/nozzle/sql/query/mod.rs b/graph/src/nozzle/sql/query/mod.rs index c411ab74846..28b3496ea93 100644 --- a/graph/src/nozzle/sql/query/mod.rs +++ b/graph/src/nozzle/sql/query/mod.rs @@ -3,7 +3,7 @@ mod resolve_event_signatures; mod resolve_source_address; mod validate_tables; -use std::fmt; +use std::{fmt, ops::RangeInclusive, 
sync::Arc}; use alloy::{ json_abi::JsonAbi, @@ -28,7 +28,7 @@ pub struct Query { dataset: Ident, /// The tables that the SQL query requests data from. - tables: Vec, + tables: Arc<[Ident]>, } impl Query { @@ -58,11 +58,11 @@ impl Query { Ok(Self { ast: query, dataset: dataset.cheap_clone(), - tables: tables.to_vec(), + tables: tables.into(), }) } - /// Applies a block range filter to this SQL query. + /// Applies a block range filter to this SQL query and returns the updated query. /// /// Creates temporary ordered result sets for each table in the dataset, limiting /// the blocks processed during execution. @@ -71,14 +71,10 @@ impl Query { /// /// This ensures deterministic output during query execution and enables resuming /// after failures or when new blocks are available. - pub fn filter_blocks(&mut self, start_block: BlockNumber, end_block: BlockNumber) { - filter_blocks::filter_blocks( - &mut self.ast, - &self.dataset, - &self.tables, - start_block, - end_block, - ); + pub fn with_block_range_filter(&self, block_range: &RangeInclusive) -> Self { + let mut query = self.clone(); + filter_blocks::filter_blocks(&mut query.ast, &query.dataset, &query.tables, &block_range); + query } /// Validates the SQL query. From 71829d429ef043181f2fac67671cdbf7a4a5f565 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 14/40] feat(graph): add decoding utilities --- graph/src/nozzle/codec/array_decoder.rs | 4 +- graph/src/nozzle/codec/decoder.rs | 2 +- graph/src/nozzle/codec/mod.rs | 9 +-- graph/src/nozzle/codec/utils.rs | 90 +++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 graph/src/nozzle/codec/utils.rs diff --git a/graph/src/nozzle/codec/array_decoder.rs b/graph/src/nozzle/codec/array_decoder.rs index d0f5bf12438..8de0cbb2702 100644 --- a/graph/src/nozzle/codec/array_decoder.rs +++ b/graph/src/nozzle/codec/array_decoder.rs @@ -19,7 +19,7 @@ use super::decoder::Decoder; use crate::data::store::scalar::{BigDecimal, BigInt}; /// Decodes Arrow arrays into Subgraph types. -pub(super) struct ArrayDecoder<'a, T: 'static>(&'a T); +pub struct ArrayDecoder<'a, T: 'static>(&'a T); impl<'a, T> ArrayDecoder<'a, T> where @@ -32,7 +32,7 @@ where /// Returns an error if the `array` cannot be downcasted to type `T`. /// /// The returned error is deterministic. - pub(super) fn new(array: &'a dyn Array) -> Result { + pub fn new(array: &'a dyn Array) -> Result { Ok(Self(downcast_ref(array)?)) } } diff --git a/graph/src/nozzle/codec/decoder.rs b/graph/src/nozzle/codec/decoder.rs index 6d433ba86f2..c0c479ab292 100644 --- a/graph/src/nozzle/codec/decoder.rs +++ b/graph/src/nozzle/codec/decoder.rs @@ -5,7 +5,7 @@ use anyhow::Result; /// This trait provides a common interface for converting Arrow format data into /// custom types. Implementations handle the specifics of extracting data from /// Arrow arrays and constructing the target type `T`. -pub(super) trait Decoder { +pub trait Decoder { /// Decodes and returns the value at the `row_index`. 
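+    ///
+    /// Implementations such as `ArrayDecoder` wrap a single Arrow array, so one
+    /// call decodes one row; callers typically iterate over the row indices of a
+    /// `RecordBatch`.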
/// /// # Errors diff --git a/graph/src/nozzle/codec/mod.rs b/graph/src/nozzle/codec/mod.rs index 185a0978c92..a116c06c77f 100644 --- a/graph/src/nozzle/codec/mod.rs +++ b/graph/src/nozzle/codec/mod.rs @@ -5,15 +5,14 @@ mod mapping_decoder; mod name_cache; mod value_decoder; +pub mod utils; + use std::collections::{BTreeMap, HashMap}; use anyhow::{anyhow, bail, Context, Result}; use arrow::array::{Array, RecordBatch}; -use self::{ - array_decoder::ArrayDecoder, decoder::Decoder, list_decoder::ListDecoder, - mapping_decoder::MappingDecoder, name_cache::NameCache, -}; +use self::{list_decoder::ListDecoder, mapping_decoder::MappingDecoder, name_cache::NameCache}; use crate::{ data::{ graphql::TypeExt, @@ -24,6 +23,8 @@ use crate::{ schema::{EntityKey, EntityType, Field, InputSchema}, }; +pub use self::{array_decoder::ArrayDecoder, decoder::Decoder}; + /// Handles decoding of record batches to Subgraph entities. pub struct Codec { input_schema: InputSchema, diff --git a/graph/src/nozzle/codec/utils.rs b/graph/src/nozzle/codec/utils.rs new file mode 100644 index 00000000000..bf4581f93ca --- /dev/null +++ b/graph/src/nozzle/codec/utils.rs @@ -0,0 +1,90 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Context, Result}; +use arrow::array::{ + Array, FixedSizeBinaryArray, RecordBatch, TimestampNanosecondArray, UInt64Array, +}; +use chrono::{DateTime, Utc}; + +use super::{ArrayDecoder, Decoder}; +use crate::nozzle::common::column_aliases; + +pub fn auto_block_number_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result> + 'a>> { + let column_index = column_index(record_batch, column_aliases::BLOCK_NUMBER) + .context("failed to find block numbers column")?; + + block_number_decoder(record_batch, column_index) +} + +pub fn block_number_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result> + 'a>> { + column_decoder::(record_batch, column_index) +} + +pub fn auto_block_hash_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result> + 'a>> { + let column_index = column_index(record_batch, column_aliases::BLOCK_HASH) + .context("failed to find block hashes column")?; + + block_hash_decoder(record_batch, column_index) +} + +pub fn block_hash_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result> + 'a>> { + column_decoder::(record_batch, column_index) +} + +pub fn auto_block_timestamp_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result>> + 'a>> { + let column_index = column_index(record_batch, column_aliases::BLOCK_TIMESTAMP) + .context("failed to find block timestamps column")?; + + block_timestamp_decoder(record_batch, column_index) +} + +pub fn block_timestamp_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result>> + 'a>> { + column_decoder::>(record_batch, column_index) +} + +pub fn column_index( + record_batch: &RecordBatch, + column_names: impl IntoIterator>, +) -> Option { + let schema_ref = record_batch.schema_ref(); + + for column_name in column_names { + if let Some((column_index, _)) = schema_ref.column_with_name(column_name.as_ref()) { + return Some(column_index); + } + } + + return None; +} + +pub fn column_decoder<'a, T: 'static, U>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result> + 'a>> +where + T: Array, + ArrayDecoder<'a, T>: Decoder>, +{ + if column_index >= record_batch.num_columns() { + bail!("column {column_index} does not exist"); + } + + let array = record_batch.column(column_index); + let decoder = ArrayDecoder::::new(array)?; + + 
Ok(Box::new(decoder)) +} From b0d0bcd6bd67c2cf343eae0f0a35583553eb2491 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 15/40] fix(graph): use decoding utilities in the stream aggregator --- .../stream_aggregator/record_batch/decoder.rs | 64 +++++-------------- 1 file changed, 16 insertions(+), 48 deletions(-) diff --git a/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs b/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs index 5c2d69c697f..26d056c623d 100644 --- a/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs +++ b/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs @@ -1,16 +1,19 @@ use alloy::primitives::{BlockHash, BlockNumber}; -use anyhow::{bail, Context, Result}; -use arrow::array::{Array, FixedSizeBinaryArray, RecordBatch, UInt64Array}; +use anyhow::{anyhow, Result}; +use arrow::array::RecordBatch; -use crate::nozzle::common::column_aliases; +use crate::nozzle::codec::{ + self, + utils::{auto_block_hash_decoder, auto_block_number_decoder}, +}; /// Decodes the data required for stream aggregation. pub(super) struct Decoder<'a> { /// Block numbers serve as group keys for related record batches. - block_number_column: &'a UInt64Array, + block_number: Box> + 'a>, /// Block hashes ensure data consistency across tables and datasets. - block_hash_column: &'a FixedSizeBinaryArray, + block_hash: Box> + 'a>, } impl<'a> Decoder<'a> { @@ -24,8 +27,8 @@ impl<'a> Decoder<'a> { /// The returned error is deterministic. pub(super) fn new(record_batch: &'a RecordBatch) -> Result { Ok(Self { - block_number_column: block_number_column(record_batch)?, - block_hash_column: block_hash_column(record_batch)?, + block_number: auto_block_number_decoder(record_batch)?, + block_hash: auto_block_hash_decoder(record_batch)?, }) } @@ -38,11 +41,9 @@ impl<'a> Decoder<'a> { /// /// The returned error is deterministic. pub(super) fn block_number(&self, row_index: usize) -> Result { - if self.block_number_column.is_null(row_index) { - bail!("block number is null"); - } - - Ok(self.block_number_column.value(row_index)) + self.block_number + .decode(row_index)? + .ok_or_else(|| anyhow!("block number is empty")) } /// Returns the block hash at `row_index`. @@ -54,41 +55,8 @@ impl<'a> Decoder<'a> { /// /// The returned error is deterministic. pub(super) fn block_hash(&self, row_index: usize) -> Result { - if self.block_hash_column.is_null(row_index) { - bail!("block hash is null"); - } - - BlockHash::try_from(self.block_hash_column.value(row_index)) - .context("block hash is invalid") - } -} - -fn block_number_column<'a>(record_batch: &'a RecordBatch) -> Result<&'a UInt64Array> { - for &column_name in column_aliases::BLOCK_NUMBER { - let Some(column) = record_batch.column_by_name(column_name) else { - continue; - }; - - return column - .as_any() - .downcast_ref() - .context("failed to downcast block number column"); + self.block_hash + .decode(row_index)? 
+ .ok_or_else(|| anyhow!("block hash is empty")) } - - bail!("failed to find block number column"); -} - -fn block_hash_column<'a>(record_batch: &'a RecordBatch) -> Result<&'a FixedSizeBinaryArray> { - for &column_name in column_aliases::BLOCK_HASH { - let Some(column) = record_batch.column_by_name(column_name) else { - continue; - }; - - return column - .as_any() - .downcast_ref() - .context("failed to downcast block hash column"); - } - - bail!("failed to find block hash column"); } From 6a0930ef67567c800ee83b45a217110bc6e333fb Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 16/40] feat(graph): add more details to Nozzle data sources --- graph/src/data/subgraph/api_version.rs | 5 ++++- graph/src/env/mod.rs | 2 +- graph/src/nozzle/common/mod.rs | 17 ++++++++++++++--- graph/src/nozzle/manifest/data_source/mod.rs | 15 ++++++++++++++- graph/src/nozzle/manifest/data_source/raw.rs | 18 +++++++++++++++--- 5 files changed, 48 insertions(+), 9 deletions(-) diff --git a/graph/src/data/subgraph/api_version.rs b/graph/src/data/subgraph/api_version.rs index dad1469c7b4..163909adabb 100644 --- a/graph/src/data/subgraph/api_version.rs +++ b/graph/src/data/subgraph/api_version.rs @@ -66,8 +66,11 @@ pub const SPEC_VERSION_1_3_0: Version = Version::new(1, 3, 0); // Enables struct field access in declarative calls pub const SPEC_VERSION_1_4_0: Version = Version::new(1, 4, 0); +// Enables support for Amp data sources; +pub const SPEC_VERSION_1_5_0: Version = Version::new(1, 5, 0); + // The latest spec version available -pub const LATEST_VERSION: &Version = &SPEC_VERSION_1_4_0; +pub const LATEST_VERSION: &Version = &SPEC_VERSION_1_5_0; pub const MIN_SPEC_VERSION: Version = Version::new(0, 0, 2); diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs index 4ba26444d51..240c0423757 100644 --- a/graph/src/env/mod.rs +++ b/graph/src/env/mod.rs @@ -475,7 +475,7 @@ struct Inner { default = "false" )] allow_non_deterministic_fulltext_search: EnvVarBoolean, - #[envconfig(from = "GRAPH_MAX_SPEC_VERSION", default = "1.4.0")] + #[envconfig(from = "GRAPH_MAX_SPEC_VERSION", default = "1.5.0")] max_spec_version: Version, #[envconfig(from = "GRAPH_LOAD_WINDOW_SIZE", default = "300")] load_window_size_in_secs: u64, diff --git a/graph/src/nozzle/common/mod.rs b/graph/src/nozzle/common/mod.rs index 9075a8edbea..fab138d7d3d 100644 --- a/graph/src/nozzle/common/mod.rs +++ b/graph/src/nozzle/common/mod.rs @@ -3,7 +3,18 @@ mod ident; pub use self::ident::Ident; pub(super) mod column_aliases { - pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &["_block_num", "block_num"]; - pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &["hash", "block_hash"]; - pub(in crate::nozzle) static BLOCK_TIMESTAMP: &[&str] = &["timestamp"]; + pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &[ + "_block_num", // Meta column present in all tables + "block_num", // Standard column in most raw tables + "block", // Common alternative name + "block_number", // Common alternative name + ]; + pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &[ + "hash", // Standard column in some raw tables + "block_hash", // Standard column in most raw tables and common alternative name + ]; + pub(in crate::nozzle) static BLOCK_TIMESTAMP: &[&str] = &[ + "timestamp", // Standard column in most raw tables + "block_timestamp", // Common alternative name + ]; } diff --git a/graph/src/nozzle/manifest/data_source/mod.rs b/graph/src/nozzle/manifest/data_source/mod.rs index 
121ca11f88f..bcca6aed7b4 100644 --- a/graph/src/nozzle/manifest/data_source/mod.rs +++ b/graph/src/nozzle/manifest/data_source/mod.rs @@ -5,8 +5,14 @@ use alloy::{ primitives::{Address, BlockNumber}, }; use arrow::datatypes::Schema; +use semver::Version; -use crate::nozzle::{common::Ident, sql::Query}; +use crate::{ + data::subgraph::SPEC_VERSION_1_5_0, + nozzle::{common::Ident, sql::Query}, +}; + +pub use self::raw::RawDataSource; /// Represents a valid data source of a Nozzle Subgraph. /// @@ -18,6 +24,9 @@ pub struct DataSource { /// Used for observability to identify progress and errors produced by this data source. pub name: Ident, + /// The network name of the data source. + pub network: String, + /// Contains the sources used by this data source. pub source: Source, @@ -27,6 +36,7 @@ pub struct DataSource { impl DataSource { pub const KIND: &str = "nozzle"; + pub const MIN_SPEC_VERSION: Version = SPEC_VERSION_1_5_0; } /// Contains the sources that a data source uses. @@ -62,6 +72,9 @@ pub struct Source { /// Contains the transformations of source tables indexed by the Subgraph. #[derive(Debug, Clone)] pub struct Transformer { + /// The version of this transformer. + pub api_version: Version, + /// The ABIs that SQL queries can reference to extract event signatures. /// /// The `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls in the diff --git a/graph/src/nozzle/manifest/data_source/raw.rs b/graph/src/nozzle/manifest/data_source/raw.rs index 24a77942812..ac3323d9b4d 100644 --- a/graph/src/nozzle/manifest/data_source/raw.rs +++ b/graph/src/nozzle/manifest/data_source/raw.rs @@ -44,6 +44,9 @@ pub struct RawDataSource { /// Must be equal to `nozzle`. pub kind: String, + /// The network name of the data source. + pub network: String, + /// Contains sources used by this data source. 
pub source: RawSource, @@ -62,6 +65,7 @@ impl RawDataSource { let Self { name, kind, + network, source, transformer, } = self; @@ -80,6 +84,7 @@ impl RawDataSource { Ok(DataSource { name, + network, source, transformer, }) @@ -222,13 +227,17 @@ impl RawTransformer { abis, tables, } = self; - let _api_version = Self::resolve_api_version(api_version)?; + let api_version = Self::resolve_api_version(api_version)?; let abis = Self::resolve_abis(logger, link_resolver, abis).await?; let tables = Self::resolve_tables(logger, link_resolver, nozzle_client, tables, source, &abis) .await?; - Ok(Transformer { abis, tables }) + Ok(Transformer { + api_version, + abis, + tables, + }) } fn resolve_api_version(api_version: Version) -> Result { @@ -482,7 +491,10 @@ impl RawTable { let check_required_column = |c: &[&str], kind: &str| { if !c.iter().any(|&c| schema.column_with_name(c).is_some()) { - return Err(Error::InvalidQuery(anyhow!("query must return {kind}"))); + return Err(Error::InvalidQuery(anyhow!( + "query must return {kind}; expected column names are: {}", + c.join(", ") + ))); } Ok(()) }; From 19cf6ddc6cf6d3a03223bc5cc376f0914db82ad1 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 17/40] feat(core, graph, node): add Nozzle subgraph deployment --- chain/ethereum/src/runtime/runtime_adapter.rs | 1 + core/src/subgraph/context/instance/mod.rs | 1 + core/src/subgraph/instance_manager.rs | 14 +- core/src/subgraph/registrar.rs | 54 +++--- graph/src/blockchain/mod.rs | 1 + graph/src/data/subgraph/mod.rs | 176 +++++++++++++---- graph/src/data_source/mod.rs | 178 +++++++++++++----- graph/src/data_source/subgraph.rs | 36 +++- node/src/bin/manager.rs | 10 + node/src/launcher.rs | 33 +++- node/src/manager/commands/run.rs | 20 ++ node/src/opt.rs | 8 + runtime/wasm/src/host.rs | 2 + server/index-node/src/resolver.rs | 27 ++- server/index-node/src/server.rs | 11 +- server/index-node/src/service.rs | 10 +- .../tests/chain/ethereum/manifest.rs | 31 ++- tests/src/fixture/mod.rs | 15 +- 18 files changed, 489 insertions(+), 139 deletions(-) diff --git a/chain/ethereum/src/runtime/runtime_adapter.rs b/chain/ethereum/src/runtime/runtime_adapter.rs index 8b11ada37cc..3a6103fc177 100644 --- a/chain/ethereum/src/runtime/runtime_adapter.rs +++ b/chain/ethereum/src/runtime/runtime_adapter.rs @@ -182,6 +182,7 @@ impl blockchain::RuntimeAdapter for RuntimeAdapter { create_host_fns(abis, archive, call_cache, eth_adapters, eth_call_gas) } data_source::DataSource::Offchain(_) => vec![], + data_source::DataSource::Nozzle(_) => vec![], }; Ok(host_fns) diff --git a/core/src/subgraph/context/instance/mod.rs b/core/src/subgraph/context/instance/mod.rs index 86b64195493..ade6981a6ee 100644 --- a/core/src/subgraph/context/instance/mod.rs +++ b/core/src/subgraph/context/instance/mod.rs @@ -182,6 +182,7 @@ where Ok(Some(host)) } } + DataSource::Nozzle(_) => unreachable!(), } } diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs index 81c1a3ccd1a..c2b0dfaf468 100644 --- a/core/src/subgraph/instance_manager.rs +++ b/core/src/subgraph/instance_manager.rs @@ -20,6 +20,7 @@ use graph::data::subgraph::{UnresolvedSubgraphManifest, SPEC_VERSION_0_0_6}; use graph::data::value::Word; use graph::data_source::causality_region::CausalityRegionSeq; use graph::env::EnvVars; +use graph::nozzle; use graph::prelude::{SubgraphInstanceManager as SubgraphInstanceManagerTrait, *}; use graph::{blockchain::BlockchainMap, 
components::store::DeploymentLocator}; use graph_runtime_wasm::module::ToAscPtr; @@ -31,7 +32,7 @@ use super::SubgraphTriggerProcessor; use crate::subgraph::runner::SubgraphRunnerError; #[derive(Clone)] -pub struct SubgraphInstanceManager { +pub struct SubgraphInstanceManager { logger_factory: LoggerFactory, subgraph_store: Arc, chains: Arc, @@ -40,6 +41,7 @@ pub struct SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, + nozzle_client: Option>, static_filters: bool, env_vars: Arc, @@ -57,7 +59,10 @@ pub struct SubgraphInstanceManager { } #[async_trait] -impl SubgraphInstanceManagerTrait for SubgraphInstanceManager { +impl SubgraphInstanceManagerTrait for SubgraphInstanceManager +where + NC: nozzle::Client + Send + Sync + 'static, +{ async fn start_subgraph( self: Arc, loc: DeploymentLocator, @@ -184,7 +189,7 @@ impl SubgraphInstanceManagerTrait for SubgraphInstanceManager< } } -impl SubgraphInstanceManager { +impl SubgraphInstanceManager { pub fn new( logger_factory: &LoggerFactory, env_vars: Arc, @@ -195,6 +200,7 @@ impl SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, + nozzle_client: Option>, static_filters: bool, ) -> Self { let logger = logger_factory.component_logger("SubgraphInstanceManager", None); @@ -208,6 +214,7 @@ impl SubgraphInstanceManager { instances: SubgraphKeepAlive::new(sg_metrics), link_resolver, ipfs_service, + nozzle_client, static_filters, env_vars, arweave_service, @@ -325,6 +332,7 @@ impl SubgraphInstanceManager { .resolve( &deployment.hash, &link_resolver, + self.nozzle_client.cheap_clone(), &logger, ENV_VARS.max_spec_version.clone(), ) diff --git a/core/src/subgraph/registrar.rs b/core/src/subgraph/registrar.rs index 2f468edda30..c67c3dbf293 100644 --- a/core/src/subgraph/registrar.rs +++ b/core/src/subgraph/registrar.rs @@ -1,31 +1,30 @@ use std::collections::HashSet; use async_trait::async_trait; -use graph::blockchain::Blockchain; -use graph::blockchain::BlockchainKind; -use graph::blockchain::BlockchainMap; -use graph::components::link_resolver::LinkResolverContext; -use graph::components::store::{DeploymentId, DeploymentLocator, SubscriptionManager}; -use graph::components::subgraph::Settings; -use graph::data::subgraph::schema::DeploymentCreate; -use graph::data::subgraph::Graft; -use graph::data::value::Word; -use graph::futures03; -use graph::futures03::future::TryFutureExt; -use graph::futures03::Stream; -use graph::futures03::StreamExt; +use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; +use graph::components::{ + link_resolver::LinkResolverContext, + store::{DeploymentId, DeploymentLocator, SubscriptionManager}, + subgraph::Settings, +}; +use graph::data::{ + subgraph::{schema::DeploymentCreate, Graft}, + value::Word, +}; +use graph::futures03::{self, future::TryFutureExt, Stream, StreamExt}; +use graph::nozzle; use graph::prelude::{CreateSubgraphResult, SubgraphRegistrar as SubgraphRegistrarTrait, *}; use graph::tokio_retry::Retry; -use graph::util::futures::retry_strategy; -use graph::util::futures::RETRY_DEFAULT_LIMIT; +use graph::util::futures::{retry_strategy, RETRY_DEFAULT_LIMIT}; -pub struct SubgraphRegistrar { +pub struct SubgraphRegistrar { logger: Logger, logger_factory: LoggerFactory, resolver: Arc, provider: Arc
<P>
, store: Arc, subscription_manager: Arc, + nozzle_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -33,11 +32,12 @@ pub struct SubgraphRegistrar { settings: Arc, } -impl SubgraphRegistrar +impl SubgraphRegistrar where P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, + NC: nozzle::Client + Send + Sync + 'static, { pub fn new( logger_factory: &LoggerFactory, @@ -45,6 +45,7 @@ where provider: Arc
<P>
, store: Arc, subscription_manager: Arc, + nozzle_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -62,6 +63,7 @@ where provider, store, subscription_manager, + nozzle_client, chains, node_id, version_switching_mode, @@ -220,11 +222,12 @@ where } #[async_trait] -impl SubgraphRegistrarTrait for SubgraphRegistrar +impl SubgraphRegistrarTrait for SubgraphRegistrar where P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, + NC: nozzle::Client + Send + Sync + 'static, { async fn create_subgraph( &self, @@ -296,7 +299,7 @@ where let deployment_locator = match kind { BlockchainKind::Ethereum => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -309,12 +312,13 @@ where debug_fork, self.version_switching_mode, &resolver, + self.nozzle_client.cheap_clone(), history_blocks, ) .await? } BlockchainKind::Near => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -327,12 +331,13 @@ where debug_fork, self.version_switching_mode, &resolver, + self.nozzle_client.cheap_clone(), history_blocks, ) .await? } BlockchainKind::Substreams => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -345,6 +350,7 @@ where debug_fork, self.version_switching_mode, &resolver, + self.nozzle_client.cheap_clone(), history_blocks, ) .await? @@ -458,7 +464,7 @@ async fn resolve_graft_block( }) } -async fn create_subgraph_version( +async fn create_subgraph_version( logger: &Logger, store: Arc, chains: Arc, @@ -471,6 +477,7 @@ async fn create_subgraph_version( debug_fork: Option, version_switching_mode: SubgraphVersionSwitchingMode, resolver: &Arc, + nozzle_client: Option>, history_blocks_override: Option, ) -> Result { let raw_string = serde_yaml::to_string(&raw).unwrap(); @@ -478,7 +485,8 @@ async fn create_subgraph_version( let unvalidated = UnvalidatedSubgraphManifest::::resolve( deployment.clone(), raw, - &resolver, + resolver, + nozzle_client, logger, ENV_VARS.max_spec_version.clone(), ) diff --git a/graph/src/blockchain/mod.rs b/graph/src/blockchain/mod.rs index 7768ea7f6e9..cd22acb8a69 100644 --- a/graph/src/blockchain/mod.rs +++ b/graph/src/blockchain/mod.rs @@ -595,6 +595,7 @@ impl FromStr for BlockchainKind { "near" => Ok(BlockchainKind::Near), "substreams" => Ok(BlockchainKind::Substreams), "subgraph" => Ok(BlockchainKind::Ethereum), // TODO(krishna): We should detect the blockchain kind from the source subgraph + "nozzle" => Ok(BlockchainKind::Ethereum), // TODO: Maybe get this from the Nozzle server _ => Err(anyhow!("unknown blockchain kind {}", s)), } } diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index 25287a94e95..3cb52497b0e 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -12,7 +12,7 @@ pub use features::{SubgraphFeature, SubgraphFeatureValidationError}; use crate::{cheap_clone::CheapClone, components::store::BLOCK_NUMBER_MAX, object}; use anyhow::{anyhow, Context, Error}; -use futures03::{future::try_join, stream::FuturesOrdered, TryStreamExt as _}; +use futures03::future::try_join_all; use itertools::Itertools; use semver::Version; use serde::{ @@ -47,7 +47,7 @@ use crate::{ UnresolvedDataSourceTemplate, }, derive::CacheWeight, - ensure, + ensure, nozzle, prelude::{r, Value, ENV_VARS}, schema::{InputSchema, SchemaValidationError}, }; @@ -363,6 
+363,8 @@ pub enum SubgraphManifestValidationError { FeatureValidationError(#[from] SubgraphFeatureValidationError), #[error("data source {0} is invalid: {1}")] DataSourceValidation(String, Error), + #[error("failed to validate Nozzle subgraph: {0:#}")] + Nozzle(#[source] Error), } #[derive(Error, Debug)] @@ -719,7 +721,7 @@ impl<'de> de::Deserialize<'de> for Prune { /// SubgraphManifest with IPFS links unresolved pub type UnresolvedSubgraphManifest = BaseSubgraphManifest< C, - UnresolvedSchema, + Option, UnresolvedDataSource, UnresolvedDataSourceTemplate, >; @@ -802,15 +804,24 @@ impl UnvalidatedSubgraphManifest { /// Entry point for resolving a subgraph definition. /// Right now the only supported links are of the form: /// `/ipfs/QmUmg7BZC1YP1ca66rRtWKxpXp77WgVHrnv263JtDuvs2k` - pub async fn resolve( + pub async fn resolve( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { Ok(Self( - SubgraphManifest::resolve_from_raw(id, raw, resolver, logger, max_spec_version).await?, + SubgraphManifest::resolve_from_raw( + id, + raw, + resolver, + nozzle_client, + logger, + max_spec_version, + ) + .await?, )) } @@ -875,6 +886,8 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version, )); + errors.append(&mut self.validate_nozzle_subgraph()); + match errors.is_empty() { true => Ok(self.0), false => Err(errors), @@ -884,20 +897,77 @@ impl UnvalidatedSubgraphManifest { pub fn spec_version(&self) -> &Version { &self.0.spec_version } + + fn validate_nozzle_subgraph(&self) -> Vec { + use api_version::SPEC_VERSION_1_4_0; + + let BaseSubgraphManifest { + id: _, + spec_version, + features, + description: _, + repository: _, + schema: _, + data_sources, + graft, + templates, + chain: _, + indexer_hints: _, + } = &self.0; + + let nozzle_data_sources = data_sources + .iter() + .filter_map(|data_source| match data_source { + DataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + _ => None, + }) + .collect_vec(); + + if nozzle_data_sources.is_empty() { + // Not a Nozzle subgraph + return Vec::new(); + } + + let mut errors = Vec::new(); + let err = |msg: &str| SubgraphManifestValidationError::Nozzle(anyhow!(msg.to_owned())); + + if data_sources.len() != nozzle_data_sources.len() { + errors.push(err("multiple data source kinds are not supported")); + } + + if *spec_version < SPEC_VERSION_1_4_0 { + errors.push(err("spec version is not supported")); + } + + if !features.is_empty() { + errors.push(err("manifest features are not supported")); + } + + if graft.is_some() { + errors.push(err("grafting is not supported")); + } + + if !templates.is_empty() { + errors.push(err("data source templates are not supported")); + } + + errors + } } impl SubgraphManifest { /// Entry point for resolving a subgraph definition. 
- pub async fn resolve_from_raw( + pub async fn resolve_from_raw( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { let unresolved = UnresolvedSubgraphManifest::parse(id.cheap_clone(), raw)?; let resolved = unresolved - .resolve(&id, resolver, logger, max_spec_version) + .resolve(&id, resolver, nozzle_client, logger, max_spec_version) .await?; Ok(resolved) } @@ -1033,10 +1103,11 @@ impl UnresolvedSubgraphManifest { serde_yaml::from_value(raw.into()).map_err(Into::into) } - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result, SubgraphManifestResolveError> { @@ -1046,7 +1117,7 @@ impl UnresolvedSubgraphManifest { features, description, repository, - schema, + schema: unresolved_schema, data_sources, graft, templates, @@ -1064,46 +1135,77 @@ impl UnresolvedSubgraphManifest { ).into()); } - let ds_count = data_sources.len(); - if ds_count as u64 + templates.len() as u64 > u32::MAX as u64 { + if data_sources.len() + templates.len() > u32::MAX as usize { return Err( - anyhow!("Subgraph has too many declared data sources and templates",).into(), + anyhow!("subgraph has too many declared data sources and templates").into(), ); } - let schema = schema - .resolve(&id, &spec_version, id.clone(), resolver, logger) - .await?; + let data_sources = try_join_all(data_sources.into_iter().enumerate().map(|(idx, ds)| { + ds.resolve( + deployment_hash, + resolver, + nozzle_client.cheap_clone(), + logger, + idx as u32, + &spec_version, + ) + })) + .await?; - let (data_sources, templates) = try_join( - data_sources - .into_iter() - .enumerate() - .map(|(idx, ds)| { - ds.resolve(deployment_hash, resolver, logger, idx as u32, &spec_version) - }) - .collect::>() - .try_collect::>(), - templates - .into_iter() - .enumerate() - .map(|(idx, template)| { - template.resolve( + let nozzle_data_sources = data_sources + .iter() + .filter_map(|data_source| match data_source { + DataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + _ => None, + }) + .collect_vec(); + + let schema = match unresolved_schema { + Some(unresolved_schema) => { + unresolved_schema + .resolve( deployment_hash, + &spec_version, + id.cheap_clone(), resolver, - &schema, logger, - ds_count as u32 + idx as u32, - &spec_version, ) - }) - .collect::>() - .try_collect::>(), - ) + .await? + } + None if nozzle_data_sources.len() == data_sources.len() => { + let table_schemas = nozzle_data_sources + .iter() + .map(|data_source| { + data_source + .transformer + .tables + .iter() + .map(|table| (table.name.cheap_clone(), table.schema.clone())) + }) + .flatten(); + + nozzle::schema::generate_subgraph_schema(&id, table_schemas)? + } + None => { + return Err(anyhow!("subgraph schema is required").into()); + } + }; + + let templates = try_join_all(templates.into_iter().enumerate().map(|(idx, template)| { + template.resolve( + &id, + resolver, + &schema, + logger, + data_sources.len() as u32 + idx as u32, + &spec_version, + ) + })) .await?; let is_substreams = data_sources.iter().any(|ds| ds.kind() == SUBSTREAMS_KIND); - if is_substreams && ds_count > 1 { + if is_substreams && data_sources.len() > 1 { return Err(anyhow!( "A Substreams-based subgraph can only contain a single data source." 
) diff --git a/graph/src/data_source/mod.rs b/graph/src/data_source/mod.rs index e7fc22228ea..4c1addbc2a6 100644 --- a/graph/src/data_source/mod.rs +++ b/graph/src/data_source/mod.rs @@ -25,7 +25,7 @@ use crate::{ prelude::{CheapClone as _, DataSourceContext}, schema::{EntityType, InputSchema}, }; -use anyhow::Error; +use anyhow::{anyhow, Context, Error}; use semver::Version; use serde::{de::IntoDeserializer as _, Deserialize, Deserializer}; use slog::{Logger, SendSyncRefUnwindSafeKV}; @@ -36,11 +36,14 @@ use std::{ }; use thiserror::Error; +use crate::nozzle; + #[derive(Debug)] pub enum DataSource { Onchain(C::DataSource), Offchain(offchain::DataSource), Subgraph(subgraph::DataSource), + Nozzle(nozzle::manifest::DataSource), } #[derive(Error, Debug)] @@ -96,6 +99,7 @@ impl DataSource { Self::Onchain(ds) => Some(ds), Self::Offchain(_) => None, Self::Subgraph(_) => None, + Self::Nozzle(_) => None, } } @@ -104,6 +108,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(_) => None, Self::Subgraph(ds) => Some(ds), + Self::Nozzle(_) => None, } } @@ -112,6 +117,7 @@ impl DataSource { Self::Onchain(_) => true, Self::Offchain(_) => false, Self::Subgraph(_) => true, + Self::Nozzle(_) => true, } } @@ -120,6 +126,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(ds) => Some(ds), Self::Subgraph(_) => None, + Self::Nozzle(_) => None, } } @@ -128,6 +135,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => ds.network(), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => ds.network(), + Self::Nozzle(ds) => Some(&ds.network), } } @@ -136,6 +144,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => Some(ds.start_block()), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => Some(ds.source.start_block), + Self::Nozzle(ds) => Some(ds.source.start_block as i32), } } @@ -152,6 +161,7 @@ impl DataSource { Self::Onchain(ds) => ds.address().map(ToOwned::to_owned), Self::Offchain(ds) => ds.address(), Self::Subgraph(ds) => ds.address(), + Self::Nozzle(ds) => Some(ds.source.address.to_vec()), } } @@ -160,6 +170,7 @@ impl DataSource { Self::Onchain(ds) => ds.name(), Self::Offchain(ds) => &ds.name, Self::Subgraph(ds) => &ds.name, + Self::Nozzle(ds) => ds.name.as_str(), } } @@ -168,6 +179,7 @@ impl DataSource { Self::Onchain(ds) => ds.kind().to_owned(), Self::Offchain(ds) => ds.kind.to_string(), Self::Subgraph(ds) => ds.kind.clone(), + Self::Nozzle(_) => nozzle::manifest::DataSource::KIND.to_string(), } } @@ -176,6 +188,7 @@ impl DataSource { Self::Onchain(ds) => ds.min_spec_version(), Self::Offchain(ds) => ds.min_spec_version(), Self::Subgraph(ds) => ds.min_spec_version(), + Self::Nozzle(_) => nozzle::manifest::DataSource::MIN_SPEC_VERSION, } } @@ -184,6 +197,7 @@ impl DataSource { Self::Onchain(ds) => ds.end_block(), Self::Offchain(_) => None, Self::Subgraph(_) => None, + Self::Nozzle(ds) => Some(ds.source.end_block as i32), } } @@ -192,6 +206,7 @@ impl DataSource { Self::Onchain(ds) => ds.creation_block(), Self::Offchain(ds) => ds.creation_block, Self::Subgraph(ds) => ds.creation_block, + Self::Nozzle(_) => None, } } @@ -200,6 +215,7 @@ impl DataSource { Self::Onchain(ds) => ds.context(), Self::Offchain(ds) => ds.context.clone(), Self::Subgraph(ds) => ds.context.clone(), + Self::Nozzle(_) => Arc::new(None), } } @@ -208,6 +224,7 @@ impl DataSource { Self::Onchain(ds) => ds.api_version(), Self::Offchain(ds) => ds.mapping.api_version.clone(), Self::Subgraph(ds) => ds.mapping.api_version.clone(), + Self::Nozzle(ds) => ds.transformer.api_version.clone(), } } @@ 
-216,6 +233,7 @@ impl DataSource { Self::Onchain(ds) => ds.runtime(), Self::Offchain(ds) => Some(ds.mapping.runtime.cheap_clone()), Self::Subgraph(ds) => Some(ds.mapping.runtime.cheap_clone()), + Self::Nozzle(_) => None, } } @@ -226,6 +244,7 @@ impl DataSource { Self::Onchain(_) => EntityTypeAccess::Any, Self::Offchain(ds) => EntityTypeAccess::Restriced(ds.mapping.entities.clone()), Self::Subgraph(_) => EntityTypeAccess::Any, + Self::Nozzle(_) => EntityTypeAccess::Any, } } @@ -234,6 +253,7 @@ impl DataSource { Self::Onchain(ds) => ds.handler_kinds(), Self::Offchain(ds) => vec![ds.handler_kind()].into_iter().collect(), Self::Subgraph(ds) => vec![ds.handler_kind()].into_iter().collect(), + Self::Nozzle(_) => HashSet::new(), } } @@ -242,6 +262,7 @@ impl DataSource { Self::Onchain(ds) => ds.has_declared_calls(), Self::Offchain(_) => false, Self::Subgraph(_) => false, + Self::Nozzle(_) => false, } } @@ -268,6 +289,7 @@ impl DataSource { | (Self::Offchain(_), TriggerData::Subgraph(_)) | (Self::Subgraph(_), TriggerData::Onchain(_)) | (Self::Subgraph(_), TriggerData::Offchain(_)) => Ok(None), + (Self::Nozzle(_), _) => Ok(None), } } @@ -284,6 +306,7 @@ impl DataSource { Self::Onchain(ds) => ds.as_stored_dynamic_data_source(), Self::Offchain(ds) => ds.as_stored_dynamic_data_source(), Self::Subgraph(_) => todo!(), // TODO(krishna) + Self::Nozzle(_) => unreachable!(), } } @@ -309,6 +332,7 @@ impl DataSource { Self::Onchain(ds) => ds.validate(spec_version), Self::Offchain(_) => vec![], Self::Subgraph(_) => vec![], // TODO(krishna) + Self::Nozzle(_) => Vec::new(), } } @@ -317,6 +341,7 @@ impl DataSource { Self::Onchain(_) => CausalityRegion::ONCHAIN, Self::Offchain(ds) => ds.causality_region, Self::Subgraph(_) => CausalityRegion::ONCHAIN, + Self::Nozzle(_) => CausalityRegion::ONCHAIN, } } } @@ -326,13 +351,15 @@ pub enum UnresolvedDataSource { Onchain(C::UnresolvedDataSource), Offchain(offchain::UnresolvedDataSource), Subgraph(subgraph::UnresolvedDataSource), + Nozzle(nozzle::manifest::data_source::RawDataSource), } impl UnresolvedDataSource { - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -349,9 +376,10 @@ impl UnresolvedDataSource { .await .map(DataSource::Onchain), Self::Subgraph(unresolved) => unresolved - .resolve::( + .resolve::( deployment_hash, resolver, + nozzle_client, logger, manifest_idx, spec_version, @@ -364,7 +392,16 @@ impl UnresolvedDataSource { for details see https://github.com/graphprotocol/graph-node/issues/3864" ); } + Self::Nozzle(raw_data_source) => match nozzle_client { + Some(nozzle_client) => raw_data_source + .resolve(logger, resolver.as_ref(), nozzle_client.as_ref()) + .await + .map(DataSource::Nozzle) + .map_err(Error::from), + None => Err(anyhow!("support for Nozzle data sources is not enabled")), + }, } + .with_context(|| format!("failed to resolve data source at index {manifest_idx}")) } } @@ -624,58 +661,95 @@ impl MappingTrigger { } } -macro_rules! 
clone_data_source { - ($t:ident) => { - impl Clone for $t { - fn clone(&self) -> Self { - match self { - Self::Onchain(ds) => Self::Onchain(ds.clone()), - Self::Offchain(ds) => Self::Offchain(ds.clone()), - Self::Subgraph(ds) => Self::Subgraph(ds.clone()), - } - } +impl Clone for DataSource { + fn clone(&self) -> Self { + match self { + Self::Onchain(ds) => Self::Onchain(ds.clone()), + Self::Offchain(ds) => Self::Offchain(ds.clone()), + Self::Subgraph(ds) => Self::Subgraph(ds.clone()), + Self::Nozzle(ds) => Self::Nozzle(ds.clone()), } - }; + } } -clone_data_source!(DataSource); -clone_data_source!(DataSourceTemplate); - -macro_rules! deserialize_data_source { - ($t:ident) => { - impl<'de, C: Blockchain> Deserialize<'de> for $t { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let map: BTreeMap = BTreeMap::deserialize(deserializer)?; - let kind = map - .get("kind") - .ok_or(serde::de::Error::missing_field("kind"))? - .as_str() - .unwrap_or("?"); - if OFFCHAIN_KINDS.contains_key(&kind) { - offchain::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Offchain) - } else if SUBGRAPH_DS_KIND == kind { - subgraph::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Subgraph) - } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { - C::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Onchain) - } else { - Err(serde::de::Error::custom(format!( - "data source has invalid `kind`; expected {}, file/ipfs", - C::KIND, - ))) - } - } +impl Clone for DataSourceTemplate { + fn clone(&self) -> Self { + match self { + Self::Onchain(ds) => Self::Onchain(ds.clone()), + Self::Offchain(ds) => Self::Offchain(ds.clone()), + Self::Subgraph(ds) => Self::Subgraph(ds.clone()), } - }; + } } -deserialize_data_source!(UnresolvedDataSource); -deserialize_data_source!(UnresolvedDataSourceTemplate); +impl<'de, C: Blockchain> Deserialize<'de> for UnresolvedDataSource { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let map: BTreeMap = BTreeMap::deserialize(deserializer)?; + + let kind = map + .get("kind") + .ok_or(serde::de::Error::missing_field("kind"))? + .as_str() + .unwrap_or("?"); + + if OFFCHAIN_KINDS.contains_key(&kind) { + offchain::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Offchain) + } else if SUBGRAPH_DS_KIND == kind { + subgraph::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Subgraph) + } else if nozzle::manifest::DataSource::KIND == kind { + nozzle::manifest::data_source::RawDataSource::deserialize(map.into_deserializer()) + .map(UnresolvedDataSource::Nozzle) + .map_err(serde::de::Error::custom) + } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { + C::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Onchain) + } else { + Err(serde::de::Error::custom(format!( + "data source has invalid `kind`; expected {}, file/ipfs", + C::KIND, + ))) + } + } +} + +impl<'de, C: Blockchain> Deserialize<'de> for UnresolvedDataSourceTemplate { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let map: BTreeMap = BTreeMap::deserialize(deserializer)?; + + let kind = map + .get("kind") + .ok_or(serde::de::Error::missing_field("kind"))? 
+ .as_str() + .unwrap_or("?"); + + if OFFCHAIN_KINDS.contains_key(&kind) { + offchain::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Offchain) + } else if SUBGRAPH_DS_KIND == kind { + subgraph::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Subgraph) + } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { + C::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Onchain) + } else { + Err(serde::de::Error::custom(format!( + "data source has invalid `kind`; expected {}, file/ipfs", + C::KIND, + ))) + } + } +} diff --git a/graph/src/data_source/subgraph.rs b/graph/src/data_source/subgraph.rs index 9f20260c6de..30784690634 100644 --- a/graph/src/data_source/subgraph.rs +++ b/graph/src/data_source/subgraph.rs @@ -28,6 +28,7 @@ use super::{ }, DataSourceTemplateInfo, TriggerWithHandler, }; +use crate::nozzle; pub const SUBGRAPH_DS_KIND: &str = "subgraph"; @@ -282,10 +283,11 @@ impl UnresolvedDataSource { Ok(()) } - async fn resolve_source_manifest( + async fn resolve_source_manifest( &self, deployment_hash: &DeploymentHash, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, ) -> Result>, Error> { let resolver: Arc = @@ -319,7 +321,13 @@ impl UnresolvedDataSource { let resolver: Arc = Arc::from(resolver.for_manifest(&self.source.address.to_string())?); source_manifest - .resolve(&deployment_hash, &resolver, logger, LATEST_VERSION.clone()) + .resolve( + &deployment_hash, + &resolver, + nozzle_client, + logger, + LATEST_VERSION.clone(), + ) .await .context(format!( "Failed to resolve source subgraph [{}] manifest", @@ -329,9 +337,10 @@ impl UnresolvedDataSource { } /// Recursively verifies that all grafts in the chain meet the minimum spec version requirement for a subgraph source - async fn verify_graft_chain_sourcable( + async fn verify_graft_chain_sourcable( manifest: Arc>, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, graft_chain: &mut Vec, ) -> Result<(), Error> { @@ -364,13 +373,20 @@ impl UnresolvedDataSource { let graft_manifest = UnresolvedSubgraphManifest::::parse(graft.base.clone(), graft_raw) .context("Failed to parse graft base manifest")? 
- .resolve(&manifest.id, resolver, logger, LATEST_VERSION.clone()) + .resolve( + &manifest.id, + resolver, + nozzle_client.cheap_clone(), + logger, + LATEST_VERSION.clone(), + ) .await .context("Failed to resolve graft base manifest")?; Box::pin(Self::verify_graft_chain_sourcable( Arc::new(graft_manifest), resolver, + nozzle_client, logger, graft_chain, )) @@ -380,10 +396,12 @@ impl UnresolvedDataSource { Ok(()) } - pub(super) async fn resolve( + #[allow(dead_code)] + pub(super) async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + nozzle_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -396,7 +414,12 @@ impl UnresolvedDataSource { let kind = self.kind.clone(); let source_manifest = self - .resolve_source_manifest::(deployment_hash, resolver, logger) + .resolve_source_manifest::( + deployment_hash, + resolver, + nozzle_client.cheap_clone(), + logger, + ) .await?; let source_spec_version = &source_manifest.spec_version; if source_spec_version < &SPEC_VERSION_1_3_0 { @@ -413,6 +436,7 @@ impl UnresolvedDataSource { Self::verify_graft_chain_sourcable( source_manifest.clone(), resolver, + nozzle_client, logger, &mut graft_chain, ) diff --git a/node/src/bin/manager.rs b/node/src/bin/manager.rs index 9e67a532a8c..a6d881a8747 100644 --- a/node/src/bin/manager.rs +++ b/node/src/bin/manager.rs @@ -105,6 +105,15 @@ pub struct Opt { pub fork_base: Option, #[clap(long, help = "version label, used for prometheus metrics")] pub version_label: Option, + + #[clap( + long, + value_name = "{HOST:PORT|URL}", + env = "GRAPH_NOZZLE_FLIGHT_SERVICE_ADDRESS", + help = "The address of the Nozzle Flight gRPC service" + )] + pub nozzle_flight_service_address: Option, + #[clap(subcommand)] pub cmd: Command, } @@ -1331,6 +1340,7 @@ async fn main() -> anyhow::Result<()> { network_name, ipfs_url, arweave_url, + opt.nozzle_flight_service_address.clone(), config, metrics_ctx, node_id, diff --git a/node/src/launcher.rs b/node/src/launcher.rs index 410ce38293a..b8b27a31a1a 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -6,7 +6,6 @@ use std::{ use anyhow::Result; use git_testament::{git_testament, render_testament}; -use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; use graph::components::link_resolver::{ArweaveClient, FileSizeLimit}; use graph::components::subgraph::Settings; use graph::data::graphql::load_manager::LoadManager; @@ -16,6 +15,10 @@ use graph::futures03::future::TryFutureExt; use graph::prelude::*; use graph::prometheus::Registry; use graph::url::Url; +use graph::{ + blockchain::{Blockchain, BlockchainKind, BlockchainMap}, + nozzle, +}; use graph_core::polling_monitor::{arweave_service, ArweaveService, IpfsService}; use graph_graphql::prelude::GraphQlRunner; use graph_server_http::GraphQLServer as GraphQLQueryServer; @@ -254,7 +257,7 @@ fn deploy_subgraph_from_flag( ); } -fn build_subgraph_registrar( +fn build_subgraph_registrar( metrics_registry: Arc, network_store: &Arc, logger_factory: &LoggerFactory, @@ -266,13 +269,18 @@ fn build_subgraph_registrar( subscription_manager: Arc, arweave_service: ArweaveService, ipfs_service: IpfsService, + nozzle_client: Option>, ) -> Arc< graph_core::subgraph::SubgraphRegistrar< graph_core::subgraph_provider::SubgraphProvider, SubgraphStore, SubscriptionManager, + NC, >, -> { +> +where + NC: nozzle::Client + Send + Sync + 'static, +{ let static_filters = ENV_VARS.experimental_static_filters; let sg_count = Arc::new(SubgraphCountMetric::new(metrics_registry.cheap_clone())); @@ 
-286,6 +294,7 @@ fn build_subgraph_registrar( link_resolver.clone(), ipfs_service, arweave_service, + nozzle_client.cheap_clone(), static_filters, ); @@ -315,6 +324,7 @@ fn build_subgraph_registrar( Arc::new(subgraph_provider), network_store.subgraph_store(), subscription_manager, + nozzle_client, blockchain_map, node_id.clone(), version_switching_mode, @@ -469,6 +479,21 @@ pub async fn run( &logger_factory, ); + let nozzle_client = match opt.nozzle_flight_service_address.as_deref() { + Some(nozzle_flight_service_address) => { + let addr = nozzle_flight_service_address + .parse() + .expect("Invalid Nozzle Flight service address"); + + let nozzle_client = nozzle::FlightClient::new(addr) + .await + .expect("Failed to connect to Nozzle Flight service"); + + Some(Arc::new(nozzle_client)) + } + None => None, + }; + start_graphman_server(opt.graphman_port, graphman_server_config).await; let launch_services = |logger: Logger, env_vars: Arc| async move { @@ -503,6 +528,7 @@ pub async fn run( blockchain_map.clone(), network_store.clone(), link_resolver.clone(), + nozzle_client.cheap_clone(), ); if !opt.disable_block_ingestor { @@ -528,6 +554,7 @@ pub async fn run( subscription_manager, arweave_service, ipfs_service, + nozzle_client, ); graph::spawn( diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs index bf4ade053e5..e27f94d16d4 100644 --- a/node/src/manager/commands/run.rs +++ b/node/src/manager/commands/run.rs @@ -15,6 +15,7 @@ use graph::components::store::DeploymentLocator; use graph::components::subgraph::{Settings, SubgraphInstanceManager as _}; use graph::endpoint::EndpointMetrics; use graph::env::EnvVars; +use graph::nozzle; use graph::prelude::{ anyhow, tokio, BlockNumber, DeploymentHash, IpfsResolver, LoggerFactory, NodeId, SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore, @@ -38,6 +39,7 @@ pub async fn run( _network_name: String, ipfs_url: Vec, arweave_url: String, + nozzle_flight_service_address: Option, config: Config, metrics_ctx: MetricsContext, node_id: NodeId, @@ -136,6 +138,22 @@ pub async fn run( let static_filters = ENV_VARS.experimental_static_filters; let sg_metrics = Arc::new(SubgraphCountMetric::new(metrics_registry.clone())); + + let nozzle_client = match nozzle_flight_service_address { + Some(nozzle_flight_service_address) => { + let addr = nozzle_flight_service_address + .parse() + .expect("Invalid Nozzle Flight service address"); + + let nozzle_client = nozzle::FlightClient::new(addr) + .await + .expect("Failed to connect to Nozzle Flight service"); + + Some(Arc::new(nozzle_client)) + } + None => None, + }; + let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new( &logger_factory, env_vars.cheap_clone(), @@ -146,6 +164,7 @@ pub async fn run( link_resolver.cheap_clone(), ipfs_service, arweave_service, + nozzle_client.cheap_clone(), static_filters, ); @@ -173,6 +192,7 @@ pub async fn run( subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, + nozzle_client, blockchain_map, node_id.clone(), SubgraphVersionSwitchingMode::Instant, diff --git a/node/src/opt.rs b/node/src/opt.rs index 9928144396a..f906de72ac0 100644 --- a/node/src/opt.rs +++ b/node/src/opt.rs @@ -230,6 +230,14 @@ pub struct Opt { help = "Port for the graphman GraphQL server" )] pub graphman_port: u16, + + #[clap( + long, + value_name = "{HOST:PORT|URL}", + env = "GRAPH_NOZZLE_FLIGHT_SERVICE_ADDRESS", + help = "The address of the Nozzle Flight gRPC service" + )] + pub nozzle_flight_service_address: 
Option, } impl From for config::Opt { diff --git a/runtime/wasm/src/host.rs b/runtime/wasm/src/host.rs index bc5610a63d0..f67ea8d5ee7 100644 --- a/runtime/wasm/src/host.rs +++ b/runtime/wasm/src/host.rs @@ -363,6 +363,7 @@ impl RuntimeHostTrait for RuntimeHost { DataSource::Onchain(_) => None, DataSource::Offchain(ds) => ds.done_at(), DataSource::Subgraph(_) => None, + DataSource::Nozzle(_) => None, } } @@ -371,6 +372,7 @@ impl RuntimeHostTrait for RuntimeHost { DataSource::Onchain(_) => {} DataSource::Offchain(ds) => ds.set_done_at(block), DataSource::Subgraph(_) => {} + DataSource::Nozzle(_) => {} } } diff --git a/server/index-node/src/resolver.rs b/server/index-node/src/resolver.rs index dbcb4cb93a0..9c59066b1c3 100644 --- a/server/index-node/src/resolver.rs +++ b/server/index-node/src/resolver.rs @@ -15,6 +15,7 @@ use graph::data::graphql::{object, IntoValue, ObjectOrInterface, ValueMap}; use graph::data::subgraph::{status, DeploymentFeatures}; use graph::data::value::Object; use graph::futures03::TryFutureExt; +use graph::nozzle; use graph::prelude::*; use graph_graphql::prelude::{a, ExecutionContext, Resolver}; @@ -95,19 +96,25 @@ impl IntoValue for PublicProofOfIndexingResult { /// Resolver for the index node GraphQL API. #[derive(Clone)] -pub struct IndexNodeResolver { +pub struct IndexNodeResolver { logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, + nozzle_client: Option>, bearer_token: Option, } -impl IndexNodeResolver { +impl IndexNodeResolver +where + S: Store, + NC: nozzle::Client + Send + Sync + 'static, +{ pub fn new( logger: &Logger, store: Arc, link_resolver: Arc, + nozzle_client: Option>, bearer_token: Option, blockchain_map: Arc, ) -> Self { @@ -118,6 +125,7 @@ impl IndexNodeResolver { blockchain_map, store, link_resolver, + nozzle_client, bearer_token, } } @@ -514,6 +522,7 @@ impl IndexNodeResolver { deployment_hash.clone(), raw_yaml, &self.link_resolver, + self.nozzle_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -531,6 +540,7 @@ impl IndexNodeResolver { deployment_hash.clone(), raw_yaml, &self.link_resolver, + self.nozzle_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -548,6 +558,7 @@ impl IndexNodeResolver { deployment_hash.clone(), raw_yaml, &self.link_resolver, + self.nozzle_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -682,7 +693,11 @@ impl IndexNodeResolver { } #[async_trait] -impl BlockPtrForNumber for IndexNodeResolver { +impl BlockPtrForNumber for IndexNodeResolver +where + S: Store, + NC: nozzle::Client + Send + Sync + 'static, +{ async fn block_ptr_for_number( &self, network: String, @@ -755,7 +770,11 @@ fn entity_changes_to_graphql(entity_changes: Vec) -> r::Value { } #[async_trait] -impl Resolver for IndexNodeResolver { +impl Resolver for IndexNodeResolver +where + S: Store, + NC: nozzle::Client + Send + Sync + 'static, +{ const CACHEABLE: bool = false; async fn query_permit(&self) -> QueryPermit { diff --git a/server/index-node/src/server.rs b/server/index-node/src/server.rs index 326d633b896..138d5e85d69 100644 --- a/server/index-node/src/server.rs +++ b/server/index-node/src/server.rs @@ -1,25 +1,29 @@ use graph::{ blockchain::BlockchainMap, + cheap_clone::CheapClone, components::{ server::server::{start, ServerHandle}, store::Store, }, + nozzle, prelude::*, }; use crate::service::IndexNodeService; /// A GraphQL server based on Hyper. 
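Throughout these hunks the client is threaded as an optional Arc and duplicated with cheap_clone(); cloning an Option<Arc<_>> only bumps a reference count, which is why each server, resolver, and service can hold its own handle. A minimal standalone illustration (Client and Resolver are hypothetical names here, not the graph-node types):

    use std::sync::Arc;

    trait Client: Send + Sync + 'static {}

    struct Resolver<NC> {
        nozzle_client: Option<Arc<NC>>,
    }

    impl<NC: Client> Resolver<NC> {
        fn new(nozzle_client: Option<Arc<NC>>) -> Self {
            Self { nozzle_client }
        }

        fn handle_for_task(&self) -> Option<Arc<NC>> {
            // Equivalent in spirit to the cheap_clone() calls in the diff:
            // bump a refcount (or copy a None), nothing more.
            self.nozzle_client.clone()
        }
    }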
-pub struct IndexNodeServer { +pub struct IndexNodeServer { logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, + nozzle_client: Option>, } -impl IndexNodeServer +impl IndexNodeServer where S: Store, + NC: nozzle::Client + Send + Sync + 'static, { /// Creates a new GraphQL server. pub fn new( @@ -27,6 +31,7 @@ where blockchain_map: Arc, store: Arc, link_resolver: Arc, + nozzle_client: Option>, ) -> Self { let logger = logger_factory.component_logger( "IndexNodeServer", @@ -42,6 +47,7 @@ where blockchain_map, store, link_resolver, + nozzle_client, } } @@ -62,6 +68,7 @@ where self.blockchain_map.clone(), store, self.link_resolver.clone(), + self.nozzle_client.cheap_clone(), )); start(logger_for_service.clone(), port, move |req| { diff --git a/server/index-node/src/service.rs b/server/index-node/src/service.rs index d07d9b9e5e3..6dcf0138566 100644 --- a/server/index-node/src/service.rs +++ b/server/index-node/src/service.rs @@ -17,6 +17,7 @@ use graph::hyper::{body::Body, Method, Request, Response, StatusCode}; use graph::components::{server::query::ServerError, store::Store}; use graph::data::query::{Query, QueryError, QueryResult, QueryResults}; +use graph::nozzle; use graph::prelude::{q, serde_json}; use graph::slog::{debug, error, Logger}; use graph_graphql::prelude::{execute_query, Query as PreparedQuery, QueryExecutionOptions}; @@ -39,23 +40,26 @@ impl GraphQLMetrics for NoopGraphQLMetrics { /// A Hyper Service that serves GraphQL over a POST / endpoint. #[derive(Debug)] -pub struct IndexNodeService { +pub struct IndexNodeService { logger: Logger, blockchain_map: Arc, store: Arc, explorer: Arc>, link_resolver: Arc, + nozzle_client: Option>, } -impl IndexNodeService +impl IndexNodeService where S: Store, + NC: nozzle::Client + Send + Sync + 'static, { pub fn new( logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, + nozzle_client: Option>, ) -> Self { let explorer = Arc::new(Explorer::new(store.clone())); @@ -65,6 +69,7 @@ where store, explorer, link_resolver, + nozzle_client, } } @@ -138,6 +143,7 @@ where &logger, store, self.link_resolver.clone(), + self.nozzle_client.cheap_clone(), validated.bearer_token, self.blockchain_map.clone(), ); diff --git a/store/test-store/tests/chain/ethereum/manifest.rs b/store/test-store/tests/chain/ethereum/manifest.rs index b72f70dcd78..7cd66ff48b5 100644 --- a/store/test-store/tests/chain/ethereum/manifest.rs +++ b/store/test-store/tests/chain/ethereum/manifest.rs @@ -17,6 +17,7 @@ use graph::data_source::offchain::OffchainDataSourceKind; use graph::data_source::{DataSourceEnum, DataSourceTemplate}; use graph::entity; use graph::env::ENV_VARS; +use graph::nozzle; use graph::prelude::web3::types::H256; use graph::prelude::{ anyhow, async_trait, serde_yaml, tokio, BigDecimal, BigInt, DeploymentHash, Link, @@ -138,7 +139,15 @@ async fn try_resolve_manifest( let resolver: Arc = Arc::new(resolver); let raw = serde_yaml::from_str(text)?; - Ok(SubgraphManifest::resolve_from_raw(id, raw, &resolver, &LOGGER, max_spec_version).await?) + Ok(SubgraphManifest::resolve_from_raw( + id, + raw, + &resolver, + Option::>::None, + &LOGGER, + max_spec_version, + ) + .await?) 
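The tests above spell the argument as Option::<Arc<nozzle::FlightClient>>::None rather than a bare None because the generic client type cannot be inferred from None alone. A small illustration of the same inference issue, using a hypothetical generic function:

    use std::sync::Arc;

    fn with_client<NC: Send + Sync + 'static>(client: Option<Arc<NC>>) -> bool {
        client.is_some()
    }

    fn main() {
        // with_client(None);                          // error: cannot infer type of `NC`
        let connected = with_client(Option::<Arc<String>>::None); // turbofish fixes inference
        assert!(!connected);
    }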
} async fn resolve_manifest( @@ -160,9 +169,16 @@ async fn resolve_unvalidated(text: &str) -> UnvalidatedSubgraphManifest { let resolver: Arc = Arc::new(resolver); let raw = serde_yaml::from_str(text).unwrap(); - UnvalidatedSubgraphManifest::resolve(id, raw, &resolver, &LOGGER, SPEC_VERSION_0_0_4.clone()) - .await - .expect("Parsing simple manifest works") + UnvalidatedSubgraphManifest::resolve( + id, + raw, + &resolver, + Option::>::None, + &LOGGER, + SPEC_VERSION_0_0_4.clone(), + ) + .await + .expect("Parsing simple manifest works") } // Some of these manifest tests should be made chain-independent, but for @@ -1313,6 +1329,7 @@ schema: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1365,6 +1382,7 @@ schema: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1441,6 +1459,7 @@ dataSources: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1519,6 +1538,7 @@ dataSources: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1628,6 +1648,7 @@ dataSources: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_1_2_0.clone(), ) @@ -1701,6 +1722,7 @@ dataSources: id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_1_3_0.clone(), ) @@ -1851,6 +1873,7 @@ specVersion: 1.3.0 id, raw, &resolver, + Option::>::None, &LOGGER, SPEC_VERSION_1_3_0.clone(), ) diff --git a/tests/src/fixture/mod.rs b/tests/src/fixture/mod.rs index 0deb580d3b2..b661f9e6d56 100644 --- a/tests/src/fixture/mod.rs +++ b/tests/src/fixture/mod.rs @@ -37,6 +37,7 @@ use graph::http_body_util::Full; use graph::hyper::body::Bytes; use graph::hyper::Request; use graph::ipfs::{IpfsClient, IpfsMetrics}; +use graph::nozzle; use graph::prelude::ethabi::ethereum_types::H256; use graph::prelude::serde_json::{self, json}; use graph::prelude::{ @@ -158,14 +159,19 @@ pub struct TestContext { pub store: Arc, pub deployment: DeploymentLocator, pub subgraph_name: SubgraphName, - pub instance_manager: - Arc>, + pub instance_manager: Arc< + graph_core::subgraph::SubgraphInstanceManager< + graph_store_postgres::SubgraphStore, + nozzle::FlightClient, + >, + >, pub link_resolver: Arc, pub arweave_resolver: Arc, pub env_vars: Arc, pub ipfs: Arc, graphql_runner: Arc, - indexing_status_service: Arc>, + indexing_status_service: + Arc>, } #[derive(Deserialize)] @@ -555,6 +561,7 @@ pub async fn setup_inner( link_resolver.cheap_clone(), ipfs_service, arweave_service, + None, static_filters, )); @@ -588,6 +595,7 @@ pub async fn setup_inner( blockchain_map.cheap_clone(), stores.network_store.cheap_clone(), link_resolver.cheap_clone(), + None, )); let panicking_subscription_manager = Arc::new(PanicSubscriptionManager {}); @@ -598,6 +606,7 @@ pub async fn setup_inner( subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, + Option::>::None, blockchain_map.clone(), node_id.clone(), SubgraphVersionSwitchingMode::Instant, From 9a661b2e8dd616c24144717914798866f9feceb7 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 18/40] feat(graph): add a dedicated Nozzle manifest resolver --- graph/src/data/subgraph/mod.rs | 21 ++++++-- graph/src/nozzle/manifest/mod.rs | 88 ++++++++++++++++++++++++++++++-- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index 3cb52497b0e..3225b10bca8 100644 --- a/graph/src/data/subgraph/mod.rs +++ 
b/graph/src/data/subgraph/mod.rs @@ -886,7 +886,7 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version, )); - errors.append(&mut self.validate_nozzle_subgraph()); + errors.append(&mut Self::validate_nozzle_subgraph(&self.0)); match errors.is_empty() { true => Ok(self.0), @@ -898,7 +898,9 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version } - fn validate_nozzle_subgraph(&self) -> Vec { + fn validate_nozzle_subgraph( + manifest: &SubgraphManifest, + ) -> Vec { use api_version::SPEC_VERSION_1_4_0; let BaseSubgraphManifest { @@ -913,7 +915,7 @@ impl UnvalidatedSubgraphManifest { templates, chain: _, indexer_hints: _, - } = &self.0; + } = manifest; let nozzle_data_sources = data_sources .iter() @@ -1276,7 +1278,7 @@ impl UnresolvedSubgraphManifest { ); } - Ok(SubgraphManifest { + let manifest = SubgraphManifest { id, spec_version, features, @@ -1288,7 +1290,16 @@ impl UnresolvedSubgraphManifest { templates, chain, indexer_hints, - }) + }; + + if let Some(e) = UnvalidatedSubgraphManifest::::validate_nozzle_subgraph(&manifest) + .into_iter() + .next() + { + return Err(anyhow::Error::from(e).into()); + } + + Ok(manifest) } } diff --git a/graph/src/nozzle/manifest/mod.rs b/graph/src/nozzle/manifest/mod.rs index e294a50ddf5..c0b83744ea9 100644 --- a/graph/src/nozzle/manifest/mod.rs +++ b/graph/src/nozzle/manifest/mod.rs @@ -1,21 +1,99 @@ pub mod data_source; -use crate::schema::InputSchema; +use std::sync::Arc; + +use anyhow::{bail, Context, Result}; +use itertools::Itertools; +use semver::Version; +use slog::Logger; + +use crate::{ + blockchain::Blockchain, + cheap_clone::CheapClone as _, + components::link_resolver::LinkResolver, + data::subgraph::{BaseSubgraphManifest, DeploymentHash, UnresolvedSubgraphManifest}, + data_source::DataSource as GenericDataSource, + nozzle::Client, + schema::InputSchema, +}; pub use self::data_source::DataSource; -/// Represents a valid Nozzle Subgraph manifest. +/// Represents a valid Nozzle subgraph manifest. /// /// This manifest contains parsed, formatted, and resolved data. #[derive(Debug, Clone)] pub struct Manifest { - /// The schema of the Subgraph. + /// The schema of the subgraph. /// /// Contains all the entities, aggregations, and relationships between them. pub schema: InputSchema, - /// The Nozzle data sources of the Subgraph. + /// The Nozzle data sources of the subgraph. /// - /// A Nozzle Subgraph can only contain Nozzle data sources. + /// A Nozzle subgraph can only contain Nozzle data sources. pub data_sources: Vec, } + +impl Manifest { + /// Resolves and returns a valid Nozzle subgraph manifest. 
+ pub async fn resolve( + logger: &Logger, + link_resolver: Arc, + nozzle_client: Arc, + max_spec_version: Version, + deployment: DeploymentHash, + raw_manifest: serde_yaml::Mapping, + ) -> Result { + let unresolved_manifest = + UnresolvedSubgraphManifest::::parse(deployment.cheap_clone(), raw_manifest) + .context("failed to parse subgraph manifest")?; + + let resolved_manifest = unresolved_manifest + .resolve( + &deployment, + &link_resolver, + Some(nozzle_client), + logger, + max_spec_version, + ) + .await + .context("failed to resolve subgraph manifest")?; + + let BaseSubgraphManifest { + id: _, + spec_version: _, + features: _, + description: _, + repository: _, + schema, + data_sources, + graft: _, + templates: _, + chain: _, + indexer_hints: _, + } = resolved_manifest; + + let data_sources_count = data_sources.len(); + let nozzle_data_sources = data_sources + .into_iter() + .filter_map(|data_source| match data_source { + GenericDataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + _ => None, + }) + .collect_vec(); + + if nozzle_data_sources.is_empty() { + bail!("invalid subgraph manifest: failed to find Nozzle data sources"); + } + + if nozzle_data_sources.len() != data_sources_count { + bail!("invalid subgraph manifest: only Nozzle data sources are allowed"); + } + + Ok(Self { + schema, + data_sources: nozzle_data_sources, + }) + } +} From f7cc3ba70ea18db7f625d47858f9c9f95354b00d Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 19/40] feat(node): add shutdown token --- node/src/main.rs | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/node/src/main.rs b/node/src/main.rs index 795b28e05aa..1bc9a0329c9 100644 --- a/node/src/main.rs +++ b/node/src/main.rs @@ -1,11 +1,9 @@ use clap::Parser as _; use git_testament::git_testament; - -use graph::prelude::*; -use graph::{env::EnvVars, log::logger}; - +use graph::{env::EnvVars, log::logger, prelude::*}; use graph_core::polling_monitor::ipfs_service; use graph_node::{launcher, opt}; +use tokio_util::sync::CancellationToken; git_testament!(TESTAMENT); @@ -27,6 +25,8 @@ fn main() { async fn main_inner() { env_logger::init(); + + let _cancel_token = shutdown_token(); let env_vars = Arc::new(EnvVars::from_env().unwrap()); let opt = opt::Opt::parse(); @@ -64,3 +64,39 @@ async fn main_inner() { ) .await; } + +fn shutdown_token() -> CancellationToken { + use tokio::signal; + + let cancel_token = CancellationToken::new(); + let cancel_token_clone = cancel_token.clone(); + + async fn shutdown_signal_handler() { + let ctrl_c = async { + signal::ctrl_c().await.unwrap(); + }; + + #[cfg(unix)] + let terminate = async { + signal::unix::signal(signal::unix::SignalKind::terminate()) + .unwrap() + .recv() + .await; + }; + + #[cfg(not(unix))] + let terminate = std::future::pending::<()>(); + + tokio::select! 
{ + _ = ctrl_c => {}, + _ = terminate => {}, + }; + } + + tokio::spawn(async move { + shutdown_signal_handler().await; + cancel_token_clone.cancel(); + }); + + cancel_token +} From 4c747b93011b1320c104ae1a24c332571e96fe91 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:19:39 +0200 Subject: [PATCH 20/40] feat(core, graph): add Nozzle subgraph runner --- Cargo.lock | 27 +- core/Cargo.toml | 3 + core/src/lib.rs | 1 + core/src/nozzle_subgraph/manager.rs | 171 ++++++ core/src/nozzle_subgraph/metrics.rs | 36 ++ core/src/nozzle_subgraph/mod.rs | 8 + core/src/nozzle_subgraph/monitor.rs | 573 ++++++++++++++++++ core/src/nozzle_subgraph/runner/compat.rs | 50 ++ core/src/nozzle_subgraph/runner/context.rs | 96 +++ .../nozzle_subgraph/runner/data_processing.rs | 231 +++++++ .../src/nozzle_subgraph/runner/data_stream.rs | 182 ++++++ core/src/nozzle_subgraph/runner/error.rs | 43 ++ .../nozzle_subgraph/runner/latest_blocks.rs | 181 ++++++ core/src/nozzle_subgraph/runner/mod.rs | 140 +++++ .../nozzle_subgraph/runner/reorg_handler.rs | 162 +++++ gnd/Cargo.toml | 3 +- gnd/src/main.rs | 42 +- graph/Cargo.toml | 1 + graph/src/blockchain/types.rs | 6 + graph/src/cheap_clone.rs | 3 + graph/src/components/store/err.rs | 6 + node/src/launcher.rs | 27 +- node/src/main.rs | 3 +- node/src/manager/commands/run.rs | 35 +- 24 files changed, 2005 insertions(+), 25 deletions(-) create mode 100644 core/src/nozzle_subgraph/manager.rs create mode 100644 core/src/nozzle_subgraph/metrics.rs create mode 100644 core/src/nozzle_subgraph/mod.rs create mode 100644 core/src/nozzle_subgraph/monitor.rs create mode 100644 core/src/nozzle_subgraph/runner/compat.rs create mode 100644 core/src/nozzle_subgraph/runner/context.rs create mode 100644 core/src/nozzle_subgraph/runner/data_processing.rs create mode 100644 core/src/nozzle_subgraph/runner/data_stream.rs create mode 100644 core/src/nozzle_subgraph/runner/error.rs create mode 100644 core/src/nozzle_subgraph/runner/latest_blocks.rs create mode 100644 core/src/nozzle_subgraph/runner/mod.rs create mode 100644 core/src/nozzle_subgraph/runner/reorg_handler.rs diff --git a/Cargo.lock b/Cargo.lock index 9e5af2832f1..c2ff2af3129 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1068,9 +1068,9 @@ dependencies = [ [[package]] name = "bitcoin-io" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b47c4ab7a93edb0c7198c5535ed9b52b63095f4e9b45279c6736cec4b856baf" +checksum = "2dee39a0ee5b4095224a0cfc6bf4cc1baf0f9624b96b367e53b66d974e51d953" [[package]] name = "bitcoin_hashes" @@ -1620,9 +1620,9 @@ checksum = "02e3f4d783a55c64266d17dc67d2708852235732a100fc40dd9f1051adc64d7b" [[package]] name = "crc" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" dependencies = [ "crc-catalog", ] @@ -2700,6 +2700,7 @@ dependencies = [ "pq-sys", "serde", "tokio", + "tokio-util 0.7.17", ] [[package]] @@ -2784,6 +2785,7 @@ dependencies = [ "tokio", "tokio-retry", "tokio-stream", + "tokio-util 0.7.17", "toml 0.9.7", "tonic", "tonic-build", @@ -2863,10 +2865,13 @@ dependencies = [ name = "graph-core" version = "0.36.0" dependencies = [ + "alloy", "anyhow", + "arrow", "async-trait", "atomic_refcell", "bytes", + "chrono", "cid", "futures 0.3.31", "graph", @@ 
-3338,9 +3343,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hex-conservative" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5313b072ce3c597065a808dbf612c4c8e8590bdbf8b579508bf7a762c5eae6cd" +checksum = "fda06d18ac606267c40c04e41b9947729bf8b9efe74bd4e82b61a5f26a510b9f" dependencies = [ "arrayvec 0.7.4", ] @@ -3822,7 +3827,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.15.2", "serde", "serde_core", ] @@ -8566,18 +8571,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.28" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43fa6694ed34d6e57407afbccdeecfa268c470a7d2a5b0cf49ce9fcc345afb90" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.28" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c640b22cd9817fae95be82f0d2f90b11f7605f6c319d16705c459b27ac2cbc26" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" dependencies = [ "proc-macro2", "quote", diff --git a/core/Cargo.toml b/core/Cargo.toml index 72fc4ad05ea..bbe5695712f 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -20,6 +20,9 @@ cid = "0.11.1" anyhow = "1.0" # Dependencies related to Amp subgraphs +alloy.workspace = true +arrow.workspace = true +chrono.workspace = true futures.workspace = true itertools.workspace = true parking_lot.workspace = true diff --git a/core/src/lib.rs b/core/src/lib.rs index 1e9a7bfbdc3..45a4a7896d5 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,3 +1,4 @@ +pub mod nozzle_subgraph; pub mod polling_monitor; pub mod subgraph; diff --git a/core/src/nozzle_subgraph/manager.rs b/core/src/nozzle_subgraph/manager.rs new file mode 100644 index 00000000000..5f757ce71a6 --- /dev/null +++ b/core/src/nozzle_subgraph/manager.rs @@ -0,0 +1,171 @@ +use std::sync::Arc; + +use alloy::primitives::BlockNumber; +use anyhow::Context; +use async_trait::async_trait; +use graph::{ + components::{ + link_resolver::{LinkResolver, LinkResolverContext}, + metrics::MetricsRegistry, + store::{DeploymentLocator, SubgraphStore}, + subgraph::SubgraphInstanceManager, + }, + env::EnvVars, + log::factory::LoggerFactory, + nozzle, + prelude::CheapClone, +}; +use slog::{debug, error}; +use tokio_util::sync::CancellationToken; + +use super::{runner, Metrics, Monitor}; + +/// Manages Nozzle subgraph runner futures. +/// +/// Creates and schedules Nozzle subgraph runner futures for execution on demand. +/// Also handles stopping previously started Nozzle subgraph runners. +pub struct Manager { + logger_factory: LoggerFactory, + metrics_registry: Arc, + env_vars: Arc, + monitor: Monitor, + subgraph_store: Arc, + link_resolver: Arc, + nozzle_client: Arc, +} + +impl Manager +where + SS: SubgraphStore, + NC: nozzle::Client, +{ + /// Creates a new Nozzle subgraph manager. 
+ pub fn new( + logger_factory: &LoggerFactory, + metrics_registry: Arc, + env_vars: Arc, + cancel_token: &CancellationToken, + subgraph_store: Arc, + link_resolver: Arc, + nozzle_client: Arc, + ) -> Self { + let logger = logger_factory.component_logger("NozzleSubgraphManager", None); + let logger_factory = logger_factory.with_parent(logger); + + let monitor = Monitor::new(&logger_factory, cancel_token); + + Self { + logger_factory, + metrics_registry, + env_vars, + monitor, + subgraph_store, + link_resolver, + nozzle_client, + } + } +} + +#[async_trait] +impl SubgraphInstanceManager for Manager +where + SS: SubgraphStore, + NC: nozzle::Client + Send + Sync + 'static, +{ + async fn start_subgraph( + self: Arc, + deployment: DeploymentLocator, + stop_block: Option, + ) { + let manager = self.cheap_clone(); + + self.monitor.start( + deployment.cheap_clone(), + Box::new(move |cancel_token| { + Box::pin(async move { + let logger = manager.logger_factory.subgraph_logger(&deployment); + + let store = manager + .subgraph_store + .cheap_clone() + .writable(logger.cheap_clone(), deployment.id, Vec::new().into()) + .await + .context("failed to create writable store")?; + + let metrics = Metrics::new( + &logger, + manager.metrics_registry.cheap_clone(), + store.cheap_clone(), + deployment.hash.cheap_clone(), + ); + + let link_resolver = manager + .link_resolver + .for_manifest(&deployment.hash.to_string()) + .context("failed to create link resolver")?; + + let manifest_bytes = link_resolver + .cat( + &LinkResolverContext::new(&deployment.hash, &logger), + &deployment.hash.to_ipfs_link(), + ) + .await + .context("failed to load subgraph manifest")?; + + let raw_manifest = serde_yaml::from_slice(&manifest_bytes) + .context("failed to parse subgraph manifest")?; + + let mut manifest = nozzle::Manifest::resolve::( + &logger, + manager.link_resolver.cheap_clone(), + manager.nozzle_client.cheap_clone(), + manager.env_vars.max_spec_version.cheap_clone(), + deployment.hash.cheap_clone(), + raw_manifest, + ) + .await?; + + if let Some(stop_block) = stop_block { + for data_source in manifest.data_sources.iter_mut() { + data_source.source.end_block = stop_block as BlockNumber; + } + } + + store + .start_subgraph_deployment(&logger) + .await + .context("failed to start subgraph deployment")?; + + let runner_context = runner::Context::new( + &logger, + &manager.env_vars.nozzle, + manager.nozzle_client.cheap_clone(), + store, + deployment.hash.cheap_clone(), + manifest, + metrics, + ); + + let runner_result = runner::new_runner(runner_context)(cancel_token).await; + + match manager.subgraph_store.stop_subgraph(&deployment).await { + Ok(()) => { + debug!(logger, "Subgraph writer stopped"); + } + Err(e) => { + error!(logger, "Failed to stop subgraph writer"; + "e" => ?e + ); + } + } + + runner_result + }) + }), + ); + } + + async fn stop_subgraph(&self, deployment: DeploymentLocator) { + self.monitor.stop(deployment); + } +} diff --git a/core/src/nozzle_subgraph/metrics.rs b/core/src/nozzle_subgraph/metrics.rs new file mode 100644 index 00000000000..6bdb266b337 --- /dev/null +++ b/core/src/nozzle_subgraph/metrics.rs @@ -0,0 +1,36 @@ +use std::sync::Arc; + +use graph::{ + cheap_clone::CheapClone, + components::{ + metrics::{stopwatch::StopwatchMetrics, MetricsRegistry}, + store::WritableStore, + }, + data::subgraph::DeploymentHash, +}; +use slog::Logger; + +/// Contains deployment specific metrics. 
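One detail from start_subgraph above worth calling out: when a stop block is supplied, it is written into the end_block of every resolved Nozzle data source before the runner starts. The same idea in isolation, with simplified types (the real field lives at data_source.source.end_block):

    struct Source {
        start_block: u64,
        end_block: u64,
    }

    fn apply_stop_block(sources: &mut [Source], stop_block: Option<u64>) {
        if let Some(stop_block) = stop_block {
            for source in sources.iter_mut() {
                // end_block caps how far this data source is indexed.
                source.end_block = stop_block;
            }
        }
    }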
+pub(super) struct Metrics { + pub(super) stopwatch: StopwatchMetrics, +} + +impl Metrics { + /// Creates new deployment specific metrics. + pub(super) fn new( + logger: &Logger, + metrics_registry: Arc, + store: Arc, + deployment: DeploymentHash, + ) -> Self { + let stopwatch = StopwatchMetrics::new( + logger.cheap_clone(), + deployment, + "nozzle-process", + metrics_registry, + store.shard().to_string(), + ); + + Self { stopwatch } + } +} diff --git a/core/src/nozzle_subgraph/mod.rs b/core/src/nozzle_subgraph/mod.rs new file mode 100644 index 00000000000..3d3846742aa --- /dev/null +++ b/core/src/nozzle_subgraph/mod.rs @@ -0,0 +1,8 @@ +mod manager; +mod metrics; +mod monitor; +mod runner; + +use self::{metrics::Metrics, monitor::Monitor}; + +pub use self::manager::Manager; diff --git a/core/src/nozzle_subgraph/monitor.rs b/core/src/nozzle_subgraph/monitor.rs new file mode 100644 index 00000000000..0d49b8cc1e2 --- /dev/null +++ b/core/src/nozzle_subgraph/monitor.rs @@ -0,0 +1,573 @@ +//! This module is responsible for executing subgraph runner futures. +//! +//! # Terminology used in this module +//! +//! `active subgraph` - A subgraph that was started and is still tracked. +//! `running subgraph` - A subgraph that has an instance that is making progress or stopping. +//! `subgraph instance` - A background process that executes the subgraph runner future. + +use std::{ + collections::{hash_map::Entry, HashMap}, + fmt, + sync::{ + atomic::{AtomicU32, Ordering::SeqCst}, + Arc, + }, + time::Duration, +}; + +use anyhow::Result; +use futures::future::BoxFuture; +use graph::{ + cheap_clone::CheapClone, components::store::DeploymentLocator, log::factory::LoggerFactory, +}; +use slog::{debug, error, info, warn, Logger}; +use tokio::{sync::mpsc, task::JoinHandle, time::timeout}; +use tokio_util::sync::CancellationToken; + +/// Represents the maximum amount of time a subgraph instance is allowed to run +/// after it receives a cancel signal. +/// +/// If a subgraph instance does not complete its execution in this amount of time +/// it is considered unresponsive and is aborted. +const SUBGRAPH_INSTANCE_GRACE_PERIOD: Duration = { + if cfg!(test) { + Duration::from_millis(300) + } else if cfg!(debug_assertions) { + Duration::from_secs(30) + } else { + Duration::from_secs(300) + } +}; + +/// Represents the subgraph runner future. +/// +/// This is the future that performs the subgraph indexing. +/// It is expected to return only on deterministic failures or when indexing is completed. +/// All retry functionality must be handled internally by this future. +pub(super) type BoxRunner = + Box BoxFuture<'static, Result<()>> + Send + 'static>; + +/// Manages the lifecycle of subgraph runners. +/// +/// Ensures that there is at most one subgraph instance running +/// for any subgraph deployment at any point in time. +/// Handles starting, stopping and restarting subgraphs. +pub(super) struct Monitor { + logger_factory: Arc, + + /// Every subgraph instance is assigned a cancel token derived from this token. + /// + /// This means that the `Monitor` can send cancel signals to all subgraph instances at once, + /// and to each subgraph instance individually. + cancel_token: CancellationToken, + + /// The channel that is used to send subgraph commands. + /// + /// Every subgraph start and stop request results in a command that is sent to the + /// background process that manages the subgraph instances. + command_tx: mpsc::UnboundedSender, + + /// When a subgraph starts it is assigned a sequential ID. 
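As described above, a BoxRunner is a boxed closure that receives a child CancellationToken and returns the indexing future; retries live inside that future, and a clean return is expected once the token fires. A standalone sketch of a future honouring that contract (illustrative only; the real runner is built by runner::new_runner):

    use tokio_util::sync::CancellationToken;

    async fn example_runner(cancel_token: CancellationToken) -> anyhow::Result<()> {
        loop {
            tokio::select! {
                _ = cancel_token.cancelled() => {
                    // Graceful stop requested by Monitor::stop() or a restart.
                    return Ok(());
                }
                _ = tokio::task::yield_now() => {
                    // ... fetch and process the next batch of Nozzle data here ...
                }
            }
        }
    }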
+ /// The ID is then kept in memory in the list of active subgraphs. + /// + /// When the subgraph completes execution it should be removed from the + /// list of active subgraphs, so that it can be restarted. + /// + /// This ID is required to be able to check if the active subgraph + /// is the same subgraph instance that was stopped. + /// + /// If the IDs do not match, it means that the subgraph was force restarted, + /// ignoring the state of the previous subgraph instance, or that the subgraph + /// was restarted after the previous subgraph instance completed its execution + /// but before the remove request was processed. + subgraph_instance_id: Arc, +} + +impl Monitor { + /// Creates a new subgraph monitor. + /// + /// Spawns a background process that manages the subgraph start and stop requests. + /// + /// A new cancel token is derived from the `cancel_token` and only the derived token is used by the + /// subgraph monitor and its background process. + pub(super) fn new(logger_factory: &LoggerFactory, cancel_token: &CancellationToken) -> Self { + let logger = logger_factory.component_logger("NozzleSubgraphMonitor", None); + let logger_factory = Arc::new(logger_factory.with_parent(logger)); + + // A derived token makes sure it is not possible to accidentally cancel the parent token + let cancel_token = cancel_token.child_token(); + + // It is safe to use an unbounded channel here, because it's pretty much unrealistic that the + // command processor will fall behind so much that the channel buffer will take up all the memory. + // The command processor is non-blocking and delegates long-running processes to detached tasks. + let (command_tx, command_rx) = mpsc::unbounded_channel::(); + + tokio::spawn(Self::command_processor( + logger_factory.cheap_clone(), + cancel_token.cheap_clone(), + command_tx.clone(), + command_rx, + )); + + Self { + logger_factory, + cancel_token, + command_tx, + subgraph_instance_id: Arc::new(AtomicU32::new(0)), + } + } + + /// Starts a subgraph. + /// + /// Sends a subgraph start request to this subgraph monitor that + /// eventually starts the subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active, it starts when the request is processed + /// - If the subgraph is active, it stops, and then restarts + /// - Ensures that there is only one subgraph instance for this subgraph deployment + /// - Multiple consecutive calls in a short time period force restart the subgraph, + /// aborting the active subgraph instance + pub(super) fn start(&self, deployment: DeploymentLocator, runner: BoxRunner) { + let logger = self + .logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("method" => "start")); + + info!(logger, "Starting subgraph"); + handle_send_result( + &logger, + self.command_tx.send(Command::Start { + id: self.subgraph_instance_id.fetch_add(1, SeqCst), + deployment, + runner, + }), + ); + } + + /// Stops the subgraph. + /// + /// Sends a subgraph stop request to this subgraph monitor that + /// eventually stops the subgraph. 
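Monitor::new above funnels every start and stop request through an unbounded mpsc channel to a single background task; having one consumer apply commands in order is what guarantees at most one instance per deployment. Stripped to its essentials (names here are illustrative):

    use tokio::sync::mpsc;

    enum Cmd {
        Start(String),
        Stop(String),
    }

    fn spawn_command_processor() -> mpsc::UnboundedSender<Cmd> {
        let (tx, mut rx) = mpsc::unbounded_channel::<Cmd>();
        tokio::spawn(async move {
            // A single consumer applies commands strictly in the order they were sent.
            while let Some(cmd) = rx.recv().await {
                match cmd {
                    Cmd::Start(deployment) => println!("start {deployment}"),
                    Cmd::Stop(deployment) => println!("stop {deployment}"),
                }
            }
        });
        tx
    }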
+ /// + /// # Behaviour + /// + /// - If the subgraph is not active does nothing + /// - If the subgraph is active, sends a cancel signal that gracefully stops the subgraph + /// - If the subgraph fails to stop after an extended period of time it aborts + pub(super) fn stop(&self, deployment: DeploymentLocator) { + let logger = self + .logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("method" => "stop")); + + info!(logger, "Stopping subgraph"); + handle_send_result(&logger, self.command_tx.send(Command::Stop { deployment })); + } + + /// Processes commands sent through the command channel. + /// + /// Tracks active subgraphs and keeps a list of pending start commands. + /// Pending start commands are start commands that execute after the related subgraph stops. + async fn command_processor( + logger_factory: Arc, + cancel_token: CancellationToken, + command_tx: mpsc::UnboundedSender, + mut command_rx: mpsc::UnboundedReceiver, + ) { + let logger = logger_factory.component_logger("CommandProcessor", None); + let mut subgraph_instances: HashMap = HashMap::new(); + let mut pending_start_commands: HashMap = HashMap::new(); + + loop { + tokio::select! { + Some(command) = command_rx.recv() => { + debug!(logger, "Processing a new command"; + "command" => ?command + ); + + match &command { + Command::Start { .. } => { + Self::process_start_command( + &logger_factory, + &cancel_token, + &mut subgraph_instances, + &mut pending_start_commands, + &command_tx, + command + ); + }, + Command::Stop { .. } => { + Self::process_stop_command( + &logger_factory, + &mut subgraph_instances, + &mut pending_start_commands, + command + ); + }, + Command::Clear { .. } => { + Self::process_clear_command( + &logger_factory, + &mut subgraph_instances, + &mut pending_start_commands, + &command_tx, + command + ); + }, + } + }, + _ = cancel_token.cancelled() => { + debug!(logger, "Stopping command processor"); + + // All active Subgraphs will shutdown gracefully + // because their cancel tokens are derived from this cancelled token. + return; + } + } + } + } + + /// Starts a subgraph. 
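The stop behaviour described above, cancel first and abort only if the instance does not finish in time, maps onto a small tokio pattern. The real monitor uses SUBGRAPH_INSTANCE_GRACE_PERIOD for the deadline; the wiring below is only a sketch:

    use std::time::Duration;

    use tokio::time::timeout;
    use tokio_util::sync::CancellationToken;

    async fn stop_with_grace(
        cancel_token: CancellationToken,
        mut handle: tokio::task::JoinHandle<()>,
    ) {
        const GRACE: Duration = Duration::from_secs(300);

        // Ask the instance to stop on its own first.
        cancel_token.cancel();

        // If it is still running after the grace period, treat it as unresponsive and abort it.
        if timeout(GRACE, &mut handle).await.is_err() {
            handle.abort();
        }
    }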
+ /// + /// # Behaviour + /// + /// - If the subgraph is not active, it starts right away + /// - If the subgraph is active, a cancel signal is sent to the active subgraph instance + /// and this start request is stored in the list of pending start commands + /// - If the subgraph is active and there is already a pending start command, + /// the active subgraph instance aborts, and the subgraph force restarts right away + /// - If the subgraph is active, but its instance is not actually running, + /// the subgraph starts right away + fn process_start_command( + logger_factory: &LoggerFactory, + cancel_token: &CancellationToken, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command_tx: &mpsc::UnboundedSender, + command: Command, + ) { + let Command::Start { + id, + deployment, + runner, + } = command + else { + unreachable!(); + }; + + let logger = logger_factory.subgraph_logger(&deployment); + let command_logger = logger.new(slog::o!("command" => "start")); + + let cancel_token = cancel_token.child_token(); + let pending_start_command = pending_start_commands.remove(&deployment); + + match subgraph_instances.entry(deployment.cheap_clone()) { + Entry::Vacant(entry) => { + debug!(command_logger, "Subgraph is not active, starting"); + + let subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + entry.insert(subgraph_instance); + } + Entry::Occupied(mut entry) => { + let subgraph_instance = entry.get_mut(); + subgraph_instance.cancel_token.cancel(); + + if pending_start_command.is_some() { + debug!(command_logger, "Subgraph is active, force restarting"); + + subgraph_instance.handle.abort(); + + *subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + return; + } + + if subgraph_instance.handle.is_finished() { + debug!(command_logger, "Subgraph is not running, starting"); + + *subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + return; + } + + debug!(command_logger, "Gracefully restarting subgraph"); + + pending_start_commands.insert( + deployment.cheap_clone(), + Command::Start { + id, + deployment, + runner, + }, + ); + } + } + } + + /// Stops a subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active, does nothing + /// - If the subgraph is active, sends a cancel signal to the active subgraph instance + fn process_stop_command( + logger_factory: &LoggerFactory, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command: Command, + ) { + let Command::Stop { deployment } = command else { + unreachable!(); + }; + + let logger = logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("command" => "stop")); + + if let Some(subgraph_instance) = subgraph_instances.get(&deployment) { + debug!(logger, "Sending cancel signal"); + subgraph_instance.cancel_token.cancel(); + } else { + debug!(logger, "Subgraph is not active"); + } + + pending_start_commands.remove(&deployment); + } + + /// Removes a subgraph from the list of active subgraphs allowing the subgraph to be restarted. 
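+    ///
+    /// # Behaviour
+    ///
+    /// - If the `id` carried by the clear command matches the ID of the active
+    ///   subgraph instance, the instance is removed from the list of active subgraphs
+    /// - If the IDs do not match, the active instance was already replaced
+    ///   (for example by a force restart) and is left untouched
+    /// - If the subgraph was removed or is no longer active and a pending start
+    ///   command exists, that command is resent so the subgraph restarts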
+ fn process_clear_command( + logger_factory: &LoggerFactory, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command_tx: &mpsc::UnboundedSender, + command: Command, + ) { + let Command::Clear { id, deployment } = command else { + unreachable!(); + }; + + let logger = logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("command" => "clear")); + + match subgraph_instances.get(&deployment) { + Some(subgraph_instance) if subgraph_instance.id == id => { + debug!(logger, "Removing active subgraph"); + subgraph_instances.remove(&deployment); + } + Some(_subgraph_instance) => { + debug!(logger, "Active subgraph does not need to be removed"); + return; + } + None => { + debug!(logger, "Subgraph is not active"); + } + } + + if let Some(pending_start_command) = pending_start_commands.remove(&deployment) { + debug!(logger, "Resending a pending start command"); + handle_send_result(&logger, command_tx.send(pending_start_command)); + } + } + + /// Spawns a background process that executes the subgraph runner future. + /// + /// An additional background process is spawned to handle the graceful shutdown of the subgraph runner, + /// and to ensure correct behaviour even if the subgraph runner panics. + fn start_subgraph( + logger: Logger, + cancel_token: CancellationToken, + id: u32, + deployment: DeploymentLocator, + runner: BoxRunner, + command_tx: mpsc::UnboundedSender, + ) -> SubgraphInstance { + let mut runner_handle = tokio::spawn({ + let logger = logger.new(slog::o!("process" => "subgraph_runner")); + let cancel_token = cancel_token.cheap_clone(); + + async move { + info!(logger, "Subgraph started"); + + match runner(cancel_token).await { + Ok(()) => { + info!(logger, "Subgraph stopped"); + } + Err(e) => { + error!(logger, "Subgraph failed"; + "error" => ?e + ); + } + } + } + }); + + let supervisor_handle = tokio::spawn({ + let logger = logger.new(slog::o!("process" => "subgraph_supervisor")); + let cancel_token = cancel_token.cheap_clone(); + + fn handle_runner_result(logger: &Logger, result: Result<(), tokio::task::JoinError>) { + match result { + Ok(()) => { + debug!(logger, "Subgraph completed execution"); + } + Err(e) if e.is_panic() => { + error!(logger, "Subgraph panicked"; + "error" => ?e + ); + + // TODO: Maybe abort the entire process on panic and require a full graph-node restart. + // Q: Should a bug that is triggered in a specific subgraph affect everything? + // Q: How to make this failure loud enough so it is not missed? + // + // println!("Subgraph panicked"); + // std::process::abort(); + } + Err(e) => { + error!(logger, "Subgraph failed"; + "error" => ?e + ); + } + } + } + + async move { + debug!(logger, "Subgraph supervisor started"); + + tokio::select! 
{ + _ = cancel_token.cancelled() => { + debug!(logger, "Received cancel signal, waiting for subgraph to stop"); + + match timeout(SUBGRAPH_INSTANCE_GRACE_PERIOD, &mut runner_handle).await { + Ok(result) => { + handle_runner_result(&logger, result); + }, + Err(_) => { + warn!(logger, "Subgraph did not stop after grace period, aborting"); + + runner_handle.abort(); + let _ = runner_handle.await; + + warn!(logger, "Subgraph aborted"); + } + } + }, + result = &mut runner_handle => { + handle_runner_result(&logger, result); + cancel_token.cancel(); + } + } + + debug!(logger, "Sending clear command"); + handle_send_result(&logger, command_tx.send(Command::Clear { id, deployment })); + } + }); + + SubgraphInstance { + id, + handle: supervisor_handle, + cancel_token, + } + } +} + +impl Drop for Monitor { + fn drop(&mut self) { + // Send cancel signals to all active subgraphs so that they don't remain without an associated monitor + self.cancel_token.cancel(); + } +} + +/// Represents a background process that executes the subgraph runner future. +struct SubgraphInstance { + id: u32, + handle: JoinHandle<()>, + cancel_token: CancellationToken, +} + +/// Every command used by the subgraph monitor. +enum Command { + /// A request to start executing the subgraph runner future. + Start { + id: u32, + deployment: DeploymentLocator, + runner: BoxRunner, + }, + + /// A request to stop executing the subgraph runner future. + Stop { deployment: DeploymentLocator }, + + /// A request to remove the subgraph from the list of active subgraphs. + Clear { + id: u32, + deployment: DeploymentLocator, + }, +} + +impl fmt::Debug for Command { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Start { + id, + deployment, + runner: _, + } => f + .debug_struct("Start") + .field("id", id) + .field("deployment", deployment) + .finish_non_exhaustive(), + Self::Stop { deployment } => f + .debug_struct("Stop") + .field("deployment", deployment) + .finish(), + Self::Clear { id, deployment } => f + .debug_struct("Clear") + .field("id", id) + .field("deployment", deployment) + .finish(), + } + } +} + +fn handle_send_result( + logger: &Logger, + result: Result<(), tokio::sync::mpsc::error::SendError>, +) { + match result { + Ok(()) => { + debug!(logger, "Command was sent successfully"); + } + + // This should only happen if the parent cancel token of the subgraph monitor was cancelled + Err(e) => { + error!(logger, "Failed to send command"; + "command" => ?e.0, + "error" => ?e + ); + } + } +} diff --git a/core/src/nozzle_subgraph/runner/compat.rs b/core/src/nozzle_subgraph/runner/compat.rs new file mode 100644 index 00000000000..c695238416c --- /dev/null +++ b/core/src/nozzle_subgraph/runner/compat.rs @@ -0,0 +1,50 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use chrono::{DateTime, Utc}; + +mod legacy { + pub(super) use graph::{ + blockchain::{BlockHash, BlockPtr, BlockTime}, + components::store::BlockNumber, + data::store::scalar::Timestamp, + }; +} + +pub(super) trait Compat { + fn compat(&self) -> T; +} + +impl Compat for BlockNumber { + fn compat(&self) -> legacy::BlockNumber { + (*self).try_into().unwrap() + } +} + +impl Compat for legacy::BlockNumber { + fn compat(&self) -> BlockNumber { + (*self).try_into().unwrap() + } +} + +impl Compat for BlockHash { + fn compat(&self) -> legacy::BlockHash { + legacy::BlockHash(self.0.into()) + } +} + +impl Compat for legacy::BlockHash { + fn compat(&self) -> BlockHash { + BlockHash::from_slice(&self.0) + } +} + +impl Compat for DateTime { + fn 
compat(&self) -> legacy::BlockTime { + legacy::Timestamp(*self).into() + } +} + +impl Compat for (BlockNumber, BlockHash) { + fn compat(&self) -> legacy::BlockPtr { + legacy::BlockPtr::new(self.1.compat(), self.0.compat()) + } +} diff --git a/core/src/nozzle_subgraph/runner/context.rs b/core/src/nozzle_subgraph/runner/context.rs new file mode 100644 index 00000000000..7d7ad1db741 --- /dev/null +++ b/core/src/nozzle_subgraph/runner/context.rs @@ -0,0 +1,96 @@ +use std::sync::Arc; + +use alloy::primitives::{BlockHash, BlockNumber}; +use graph::{ + cheap_clone::CheapClone, + components::store::WritableStore, + data::subgraph::DeploymentHash, + env::NozzleEnv, + nozzle::{log::Logger as _, Codec, Manifest}, + util::backoff::ExponentialBackoff, +}; +use slog::Logger; + +use super::Compat; +use crate::nozzle_subgraph::Metrics; + +pub(in super::super) struct Context { + pub(super) logger: Logger, + pub(super) client: Arc, + pub(super) store: Arc, + pub(super) max_buffer_size: usize, + pub(super) max_block_range: usize, + pub(super) backoff: ExponentialBackoff, + pub(super) deployment: DeploymentHash, + pub(super) manifest: Manifest, + pub(super) metrics: Metrics, + pub(super) codec: Codec, +} + +impl Context { + pub(in super::super) fn new( + logger: &Logger, + env: &NozzleEnv, + client: Arc, + store: Arc, + deployment: DeploymentHash, + manifest: Manifest, + metrics: Metrics, + ) -> Self { + let logger = logger.component("NozzleSubgraphRunner"); + let backoff = ExponentialBackoff::new(env.query_retry_min_delay, env.query_retry_max_delay); + let codec = Codec::new(manifest.schema.cheap_clone()); + + Self { + logger, + client, + store, + max_buffer_size: env.max_buffer_size, + max_block_range: env.max_block_range, + backoff, + deployment, + manifest, + metrics, + codec, + } + } + + pub(super) fn indexing_completed(&self) -> bool { + let Some(last_synced_block) = self.latest_synced_block() else { + return false; + }; + + self.manifest + .data_sources + .iter() + .all(|data_source| last_synced_block >= data_source.source.end_block) + } + + pub(super) fn latest_synced_block(&self) -> Option { + self.latest_synced_block_ptr() + .map(|(block_number, _)| block_number) + } + + pub(super) fn latest_synced_block_ptr(&self) -> Option<(BlockNumber, BlockHash)> { + self.store + .block_ptr() + .map(|block_ptr| (block_ptr.number.compat(), block_ptr.hash.compat())) + } + + pub(super) fn total_queries(&self) -> usize { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.transformer.tables.len()) + .sum() + } + + pub(super) fn min_start_block(&self) -> BlockNumber { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.source.start_block) + .min() + .unwrap() + } +} diff --git a/core/src/nozzle_subgraph/runner/data_processing.rs b/core/src/nozzle_subgraph/runner/data_processing.rs new file mode 100644 index 00000000000..b893d1f232f --- /dev/null +++ b/core/src/nozzle_subgraph/runner/data_processing.rs @@ -0,0 +1,231 @@ +use std::sync::Arc; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::anyhow; +use arrow::array::RecordBatch; +use chrono::{DateTime, Utc}; +use graph::{ + blockchain::block_stream::FirehoseCursor, + cheap_clone::CheapClone, + components::store::{EntityCache, ModificationsAndCache}, + nozzle::{ + codec::{utils::auto_block_timestamp_decoder, DecodeOutput, DecodedEntity, Decoder}, + stream_aggregator::{RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}, + }, +}; +use slog::{debug, trace}; + +use super::{data_stream::TablePtr, Compat, 
Context, Error}; + +pub(super) async fn process_record_batch_groups( + cx: &mut Context, + mut entity_cache: EntityCache, + record_batch_groups: RecordBatchGroups, + stream_table_ptr: Arc<[TablePtr]>, + latest_block: BlockNumber, +) -> Result { + let from_block = record_batch_groups + .first_key_value() + .map(|((block, _), _)| *block); + + let to_block = record_batch_groups + .last_key_value() + .map(|((block, _), _)| *block); + + debug!(cx.logger, "Processing record batch groups"; + "from_block" => ?from_block, + "to_block" => ?to_block + ); + + for ((block_number, block_hash), record_batch_group) in record_batch_groups { + trace!(cx.logger, "Processing record batch group"; + "block" => block_number, + "record_batches_count" => record_batch_group.record_batches.len() + ); + + entity_cache = process_record_batch_group( + cx, + entity_cache, + block_number, + block_hash, + record_batch_group, + &stream_table_ptr, + latest_block, + ) + .await + .map_err(|e| { + e.context(format!( + "failed to process record batch group at block '{block_number}'" + )) + })?; + + trace!(cx.logger, "Completed processing record batch group"; + "block" => block_number + ); + } + + debug!(cx.logger, "Completed processing record batch groups"; + "from_block" => ?from_block, + "to_block" => ?to_block + ); + + Ok(entity_cache) +} + +async fn process_record_batch_group( + cx: &mut Context, + mut entity_cache: EntityCache, + block_number: BlockNumber, + block_hash: BlockHash, + record_batch_group: RecordBatchGroup, + stream_table_ptr: &[TablePtr], + latest_block: BlockNumber, +) -> Result { + let RecordBatchGroup { record_batches } = record_batch_group; + + if record_batches.is_empty() { + debug!(cx.logger, "Record batch group is empty"); + return Ok(entity_cache); + } + + let block_timestamp = decode_block_timestamp(&record_batches) + .map_err(|e| e.context("failed to decode block timestamp"))?; + + for record_batch in record_batches { + let StreamRecordBatch { + stream_index, + record_batch, + } = record_batch; + + process_record_batch( + cx, + &mut entity_cache, + block_number, + record_batch, + stream_table_ptr[stream_index], + ) + .await + .map_err(|e| { + e.context(format!( + "failed to process record batch for stream '{stream_index}'" + )) + })?; + } + + let ModificationsAndCache { + modifications, + entity_lfu_cache, + evict_stats: _, + } = entity_cache + .as_modifications(block_number.compat()) + .map_err(Error::from) + .map_err(|e| e.context("failed to extract entity modifications from the state"))?; + + let is_close_to_chain_head = latest_block.saturating_sub(block_number) <= 100; + + cx.store + .transact_block_operations( + (block_number, block_hash).compat(), + block_timestamp.compat(), + FirehoseCursor::None, + modifications, + &cx.metrics.stopwatch, + Vec::new(), + Vec::new(), + Vec::new(), + false, + is_close_to_chain_head, + ) + .await + .map_err(Error::from) + .map_err(|e| e.context("failed to transact block operations"))?; + + Ok(EntityCache::with_current( + cx.store.cheap_clone(), + entity_lfu_cache, + )) +} + +async fn process_record_batch( + cx: &mut Context, + entity_cache: &mut EntityCache, + block_number: BlockNumber, + record_batch: RecordBatch, + (i, j): TablePtr, +) -> Result<(), Error> { + let table = &cx.manifest.data_sources[i].transformer.tables[j]; + let entity_name = &table.name; + + let DecodeOutput { + entity_type, + id_type, + decoded_entities, + } = cx + .codec + .decode(record_batch, entity_name.as_str()) + .map_err(|e| { + Error::Deterministic( + e.context(format!("failed to 
decode entities of type '{entity_name}'")), + ) + })?; + + for decoded_entity in decoded_entities { + let DecodedEntity { + key, + mut entity_data, + } = decoded_entity; + + let key = match key { + Some(key) => key, + None => { + let entity_id = entity_cache + .generate_id(id_type, block_number.compat()) + .map_err(|e| { + Error::Deterministic(e.context(format!( + "failed to generate a new id for an entity of type '{entity_name}'" + ))) + })?; + + entity_data.push(("id".into(), entity_id.clone().into())); + entity_type.key(entity_id) + } + }; + + let entity_id = key.entity_id.clone(); + let entity = cx.manifest.schema.make_entity(entity_data).map_err(|e| { + Error::Deterministic(anyhow!(e).context(format!( + "failed to create a new entity of type '{entity_name}' with id '{entity_id}'" + ))) + })?; + + entity_cache + .set(key, entity, block_number.compat(), None) + .map_err(|e| { + Error::Deterministic(e.context(format!( + "failed to store a new entity of type '{entity_name}' with id '{entity_id}'" + ))) + })?; + } + + Ok(()) +} + +fn decode_block_timestamp(record_batches: &[StreamRecordBatch]) -> Result, Error> { + let mut last_error: Option = None; + + for record_batch in record_batches { + match auto_block_timestamp_decoder(&record_batch.record_batch) { + Ok(decoder) => { + return decoder + .decode(0) + .map_err(|e| Error::Deterministic(e))? + .ok_or_else(|| Error::Deterministic(anyhow!("block timestamp is empty"))); + } + Err(e) => { + last_error = Some(Error::Deterministic(e)); + } + } + } + + Err(last_error.unwrap()) +} diff --git a/core/src/nozzle_subgraph/runner/data_stream.rs b/core/src/nozzle_subgraph/runner/data_stream.rs new file mode 100644 index 00000000000..f09f332068d --- /dev/null +++ b/core/src/nozzle_subgraph/runner/data_stream.rs @@ -0,0 +1,182 @@ +use std::{collections::HashMap, ops::RangeInclusive, sync::Arc}; + +use alloy::primitives::BlockNumber; +use anyhow::anyhow; +use futures::{ + stream::{empty, BoxStream}, + StreamExt, TryStreamExt, +}; +use graph::{ + cheap_clone::CheapClone, + nozzle::{ + manifest::DataSource, + stream_aggregator::{RecordBatchGroups, StreamAggregator}, + Client, + }, +}; +use slog::{debug, warn}; + +use super::{Context, Error}; + +pub(super) type TablePtr = (usize, usize); + +pub(super) fn new_data_stream( + cx: &Context, + latest_block: BlockNumber, +) -> BoxStream<'static, Result<(RecordBatchGroups, Arc<[TablePtr]>), Error>> +where + NC: Client, +{ + let logger = cx.logger.new(slog::o!("process" => "new_data_stream")); + + let total_queries = cx.total_queries(); + let mut total_queries_to_execute = 0; + let mut data_streams = Vec::new(); + let mut latest_queried_block = cx.latest_synced_block(); + let mut max_end_block = BlockNumber::MIN; + + debug!(logger, "Creating data stream"; + "from_block" => latest_queried_block.unwrap_or(BlockNumber::MIN), + "to_block" => latest_block, + "min_start_block" => cx.min_start_block(), + "max_block_range" => cx.max_block_range, + ); + + loop { + let next_block_ranges = next_block_ranges(&cx, latest_queried_block, latest_block); + + if next_block_ranges.is_empty() { + if data_streams.is_empty() { + warn!(logger, "There are no unprocessed block ranges"); + } + break; + } + + let mut query_streams = Vec::with_capacity(total_queries); + let mut query_streams_table_ptr = Vec::with_capacity(total_queries); + let mut min_start_block = BlockNumber::MAX; + + for (i, data_source) in cx.manifest.data_sources.iter().enumerate() { + let Some(block_range) = next_block_ranges.get(&i) else { + continue; + }; + + if 
*block_range.start() < min_start_block { + min_start_block = *block_range.start(); + } + + if *block_range.end() > max_end_block { + max_end_block = *block_range.end(); + } + + for (j, table) in data_source.transformer.tables.iter().enumerate() { + let query = table.query.with_block_range_filter(block_range); + + query_streams.push(cx.client.query(&cx.logger, query, None)); + query_streams_table_ptr.push((i, j)); + } + } + + let query_streams_table_ptr: Arc<[TablePtr]> = query_streams_table_ptr.into(); + total_queries_to_execute += query_streams.len(); + + data_streams.push( + StreamAggregator::new(&cx.logger, query_streams, cx.max_buffer_size) + .map_ok(move |response| (response, query_streams_table_ptr.cheap_clone())) + .map_err(Error::from) + .and_then(move |response| async move { + if let Some(((first_block, _), _)) = response.0.first_key_value() { + if *first_block < min_start_block { + return Err(Error::NonDeterministic(anyhow!("chain reorg"))); + } + } + + Ok(response) + }) + .boxed(), + ); + + if max_end_block >= latest_block { + break; + } + + latest_queried_block = Some(max_end_block); + } + + debug!(logger, "Created aggregated data streams"; + "total_data_streams" => data_streams.len(), + "total_queries_to_execute" => total_queries_to_execute + ); + + let mut iter = data_streams.into_iter(); + let mut merged_data_stream = iter.next().unwrap_or_else(|| empty().boxed()); + + for data_stream in iter { + merged_data_stream = merged_data_stream.chain(data_stream).boxed(); + } + + merged_data_stream +} + +fn next_block_ranges( + cx: &Context, + latest_queried_block: Option, + latest_block: BlockNumber, +) -> HashMap> { + let block_ranges = cx + .manifest + .data_sources + .iter() + .enumerate() + .filter_map(|(i, data_source)| { + next_block_range(cx, data_source, latest_queried_block, latest_block) + .map(|block_range| (i, block_range)) + }) + .collect::>(); + + let Some(min_block_range) = block_ranges + .iter() + .min_by_key(|(_, block_range)| *block_range.start()) + .map(|(_, min_block_range)| min_block_range.clone()) + else { + return HashMap::new(); + }; + + block_ranges + .into_iter() + .filter(|(_, block_range)| block_range.start() <= min_block_range.end()) + .collect() +} + +fn next_block_range( + cx: &Context, + data_source: &DataSource, + latest_queried_block: Option, + latest_block: BlockNumber, +) -> Option> { + let start_block = match latest_queried_block { + Some(latest_queried_block) => { + if latest_queried_block >= data_source.source.end_block { + return None; + } + + latest_queried_block + 1 + } + None => data_source.source.start_block, + }; + + let end_block = [ + start_block.saturating_add(cx.max_block_range as BlockNumber), + data_source.source.end_block, + latest_block, + ] + .into_iter() + .min() + .unwrap(); + + if start_block > end_block { + return None; + } + + Some(start_block..=end_block) +} diff --git a/core/src/nozzle_subgraph/runner/error.rs b/core/src/nozzle_subgraph/runner/error.rs new file mode 100644 index 00000000000..a59ecc30576 --- /dev/null +++ b/core/src/nozzle_subgraph/runner/error.rs @@ -0,0 +1,43 @@ +use graph::nozzle::error::IsDeterministic; +use thiserror::Error; + +#[derive(Debug, Error)] +pub(super) enum Error { + #[error("runner failed with a non-deterministic error: {0:#}")] + NonDeterministic(#[source] anyhow::Error), + + #[error("runner failed with a deterministic error: {0:#}")] + Deterministic(#[source] anyhow::Error), +} + +impl Error { + pub(super) fn context(self, context: C) -> Self + where + C: std::fmt::Display + Send + Sync + 
'static, + { + match self { + Self::NonDeterministic(e) => Self::NonDeterministic(e.context(context)), + Self::Deterministic(e) => Self::Deterministic(e.context(context)), + } + } + + pub(super) fn is_deterministic(&self) -> bool { + match self { + Self::Deterministic(_) => true, + Self::NonDeterministic(_) => false, + } + } +} + +impl From for Error +where + T: std::error::Error + IsDeterministic + Send + Sync + 'static, +{ + fn from(e: T) -> Self { + if e.is_deterministic() { + Self::Deterministic(e.into()) + } else { + Self::NonDeterministic(e.into()) + } + } +} diff --git a/core/src/nozzle_subgraph/runner/latest_blocks.rs b/core/src/nozzle_subgraph/runner/latest_blocks.rs new file mode 100644 index 00000000000..55a4ef2dc93 --- /dev/null +++ b/core/src/nozzle_subgraph/runner/latest_blocks.rs @@ -0,0 +1,181 @@ +use alloy::primitives::BlockNumber; +use anyhow::anyhow; +use arrow::array::RecordBatch; +use futures::{future::try_join_all, stream::BoxStream, StreamExt, TryFutureExt}; +use graph::{ + cheap_clone::CheapClone, + nozzle::{ + client::ResponseBatch, + codec::{utils::block_number_decoder, Decoder}, + common::Ident, + error::IsDeterministic, + manifest::DataSource, + Client, + }, +}; +use itertools::Itertools; +use slog::debug; + +use super::{Context, Error}; + +pub(super) type TablePtr = (usize, usize); + +pub(super) struct LatestBlocks(Vec<(TablePtr, BlockNumber)>); + +impl LatestBlocks { + pub(super) async fn load(cx: &Context) -> Result + where + NC: Client, + { + debug!(cx.logger, "Loading latest blocks"); + + let latest_block_futs = cx + .manifest + .data_sources + .iter() + .enumerate() + .map(|(i, data_source)| { + data_source + .source + .tables + .iter() + .enumerate() + .map(move |(j, table)| ((i, j), &data_source.source.dataset, table)) + }) + .flatten() + .unique_by(|(_, dataset, table)| (dataset.cheap_clone(), table.cheap_clone())) + .map(|(table_ptr, dataset, table)| { + latest_block(&cx, dataset, table) + .map_ok(move |latest_block| (table_ptr, latest_block)) + .map_err(move |e| { + e.context(format!( + "failed to load latest block for '{dataset}.{table}'" + )) + }) + }); + + try_join_all(latest_block_futs).await.map(Self) + } + + pub(super) fn filter_completed(self, cx: &Context) -> Self + where + NC: Client, + { + let latest_synced_block = cx.latest_synced_block(); + + Self( + self.0 + .into_iter() + .filter(|((i, _), _)| { + !indexing_completed(&cx.manifest.data_sources[*i], &latest_synced_block) + }) + .collect(), + ) + } + + pub(super) fn min(&self) -> BlockNumber { + self.0 + .iter() + .min_by_key(|(_, latest_block)| *latest_block) + .map(|(_, latest_block)| *latest_block) + .unwrap() + } + + pub(super) async fn changed(self, cx: &Context) -> Result<(), Error> + where + NC: Client, + { + debug!(cx.logger, "Waiting for new blocks"); + + let min_latest_block = self.min(); + let latest_synced_block = cx.latest_synced_block(); + + let latest_block_changed_futs = self + .0 + .into_iter() + .filter(|(_, latest_block)| *latest_block == min_latest_block) + .filter(|((i, _), _)| { + !indexing_completed(&cx.manifest.data_sources[*i], &latest_synced_block) + }) + .map(|((i, j), latest_block)| { + let source = &cx.manifest.data_sources[i].source; + let dataset = &source.dataset; + let table = &source.tables[j]; + + latest_block_changed(&cx, dataset, table, latest_block).map_err(move |e| { + e.context(format!( + "failed to check if the latest block changed in '{dataset}.{table}'" + )) + }) + }); + + let _response = try_join_all(latest_block_changed_futs).await?; + + Ok(()) + 
} + + pub(super) fn iter(&self) -> impl Iterator { + self.0.iter() + } +} + +fn indexing_completed(data_source: &DataSource, latest_synced_block: &Option) -> bool { + latest_synced_block + .as_ref() + .is_some_and(|latest_synced_block| *latest_synced_block >= data_source.source.end_block) +} + +async fn latest_block( + cx: &Context, + dataset: &Ident, + table: &Ident, +) -> Result +where + NC: Client, +{ + let query = format!("SELECT MAX(_block_num) FROM {dataset}.{table}"); + let stream = cx.client.query(&cx.logger, query, None); + let record_batch = read_once(stream).await?; + + let latest_block = block_number_decoder(&record_batch, 0) + .map_err(|e| Error::Deterministic(e))? + .decode(0) + .map_err(|e| Error::Deterministic(e))? + .ok_or_else(|| Error::NonDeterministic(anyhow!("table is empty")))?; + + Ok(latest_block) +} + +async fn latest_block_changed( + cx: &Context, + dataset: &Ident, + table: &Ident, + latest_block: BlockNumber, +) -> Result<(), Error> +where + NC: Client, +{ + let query = format!("SELECT _block_num FROM {dataset}.{table} WHERE _block_num > {latest_block} SETTINGS stream = true"); + let stream = cx.client.query(&cx.logger, query, None); + let _record_batch = read_once(stream).await?; + + Ok(()) +} + +async fn read_once( + mut stream: BoxStream<'static, Result>, +) -> Result +where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, +{ + let response = stream + .next() + .await + .ok_or_else(|| Error::NonDeterministic(anyhow!("stream is empty")))? + .map_err(Error::from)?; + + match response { + ResponseBatch::Batch { data } => Ok(data), + _ => Err(Error::NonDeterministic(anyhow!("response is empty"))), + } +} diff --git a/core/src/nozzle_subgraph/runner/mod.rs b/core/src/nozzle_subgraph/runner/mod.rs new file mode 100644 index 00000000000..c7088b56a11 --- /dev/null +++ b/core/src/nozzle_subgraph/runner/mod.rs @@ -0,0 +1,140 @@ +mod compat; +mod context; +mod data_processing; +mod data_stream; +mod error; +mod latest_blocks; +mod reorg_handler; + +use anyhow::Result; +use futures::{future::BoxFuture, StreamExt}; +use graph::{ + cheap_clone::CheapClone, components::store::EntityCache, data::subgraph::schema::SubgraphError, + nozzle::Client, +}; +use slog::{debug, error, warn}; +use tokio_util::sync::CancellationToken; + +use self::{ + compat::Compat, data_processing::process_record_batch_groups, data_stream::new_data_stream, + error::Error, latest_blocks::LatestBlocks, reorg_handler::check_and_handle_reorg, +}; + +pub(super) use self::context::Context; + +pub(super) fn new_runner( + mut cx: Context, +) -> Box BoxFuture<'static, Result<()>> + Send + 'static> +where + NC: Client + Send + Sync + 'static, +{ + Box::new(move |cancel_token| { + Box::pin(async move { + match cancel_token + .run_until_cancelled(run_indexing_with_retries(&mut cx)) + .await + { + Some(result) => result?, + None => { + debug!(cx.logger, "Processed cancel signal"); + } + } + + debug!(cx.logger, "Waiting for the store to finish processing"); + cx.store.flush().await?; + Ok(()) + }) + }) +} + +async fn run_indexing(cx: &mut Context) -> Result<(), Error> +where + NC: Client, +{ + loop { + debug!(cx.logger, "Running indexing"; + "latest_synced_block_ptr" => ?cx.latest_synced_block_ptr() + ); + + let mut latest_blocks = LatestBlocks::load(cx).await?; + check_and_handle_reorg(cx, &latest_blocks).await?; + + if cx.indexing_completed() { + debug!(cx.logger, "Indexing completed"); + return Ok(()); + } + + latest_blocks = latest_blocks.filter_completed(cx); + let latest_block = 
latest_blocks.min(); + + let mut deployment_is_failed = cx.store.health().await?.is_failed(); + let mut entity_cache = EntityCache::new(cx.store.cheap_clone()); + let mut stream = new_data_stream(cx, latest_block); + + while let Some(result) = stream.next().await { + let (record_batch_groups, stream_table_ptr) = result?; + + entity_cache = process_record_batch_groups( + cx, + entity_cache, + record_batch_groups, + stream_table_ptr, + latest_block, + ) + .await?; + + if deployment_is_failed { + if let Some(block_ptr) = cx.store.block_ptr() { + cx.store.unfail_non_deterministic_error(&block_ptr)?; + deployment_is_failed = false; + } + } + } + + debug!(cx.logger, "Completed indexing iteration"; + "latest_synced_block_ptr" => ?cx.latest_synced_block_ptr() + ); + + latest_blocks.changed(cx).await?; + cx.backoff.reset(); + } +} + +async fn run_indexing_with_retries(cx: &mut Context) -> Result<()> +where + NC: Client, +{ + loop { + match run_indexing(cx).await { + Ok(()) => return Ok(()), + Err(e) => { + let deterministic = e.is_deterministic(); + + cx.store + .fail_subgraph(SubgraphError { + subgraph_id: cx.deployment.cheap_clone(), + message: format!("{e:#}"), + block_ptr: None, // TODO: Find a way to propagate the block ptr here + handler: None, + deterministic, + }) + .await?; + + if deterministic { + error!(cx.logger, "Subgraph failed with a deterministic error"; + "e" => ?e + ); + return Err(e.into()); + } + + warn!(cx.logger, "Subgraph failed with a non-deterministic error"; + "e" => ?e, + "retry_delay_seconds" => cx.backoff.delay().as_secs() + ); + + cx.backoff.sleep_async().await; + debug!(cx.logger, "Restarting indexing"); + } + } + } +} diff --git a/core/src/nozzle_subgraph/runner/reorg_handler.rs b/core/src/nozzle_subgraph/runner/reorg_handler.rs new file mode 100644 index 00000000000..e18886e6168 --- /dev/null +++ b/core/src/nozzle_subgraph/runner/reorg_handler.rs @@ -0,0 +1,162 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::anyhow; +use futures::{future::try_join_all, StreamExt, TryFutureExt}; +use graph::{ + blockchain::block_stream::FirehoseCursor, + nozzle::{ + client::{LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery}, + common::Ident, + Client, + }, +}; +use itertools::Itertools; +use slog::debug; + +use super::{Compat, Context, Error, LatestBlocks}; + +pub(super) async fn check_and_handle_reorg( + cx: &Context, + latest_blocks: &LatestBlocks, +) -> Result<(), Error> +where + NC: Client, +{ + let logger = cx + .logger + .new(slog::o!("process" => "check_and_handle_reorg")); + + let Some((latest_synced_block_number, latest_synced_block_hash)) = cx.latest_synced_block_ptr() + else { + debug!(logger, "There are no synced blocks; Skipping reorg check"); + return Ok(()); + }; + + debug!(logger, "Running reorg check"); + + let Some(latest_block_before_reorg) = detect_deepest_reorg( + cx, + latest_blocks, + latest_synced_block_number, + latest_synced_block_hash, + ) + .await? 
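+    // `detect_deepest_reorg` yields `None` when no queried table reports a reorg
+    // affecting the latest synced block; otherwise it yields the deepest reported
+    // reorg, i.e. the one with the lowest latest valid block across all tables.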
+ else { + debug!(logger, "Successfully checked for reorg: No reorg detected"; + "latest_synced_block" => latest_synced_block_number + ); + return Ok(()); + }; + + debug!(logger, "Handling reorg"; + "latest_synced_block" => latest_synced_block_number, + "latest_block_before_reorg" => ?latest_block_before_reorg.block_number + ); + + let (block_number, block_hash) = match ( + latest_block_before_reorg.block_number, + latest_block_before_reorg.block_hash, + ) { + (Some(block_number), Some(block_hash)) => (block_number, block_hash), + (_, _) => { + // TODO: Handle reorgs to the genesis block + return Err(Error::Deterministic(anyhow!( + "invalid reorg: rewind to the genesis block not supported" + ))); + } + }; + + if block_number > latest_synced_block_number { + return Err(Error::Deterministic(anyhow!( + "invalid reorg: latest block before reorg cannot be higher than the invalidated block" + ))); + } else if block_number == latest_synced_block_number && block_hash == latest_synced_block_hash { + return Err(Error::Deterministic(anyhow!( + "invalid reorg: latest block before reorg cannot be equal to the invalidated block" + ))); + } + + cx.store + .revert_block_operations((block_number, block_hash).compat(), FirehoseCursor::None) + .await + .map_err(Error::from)?; + + Ok(()) +} + +async fn detect_deepest_reorg( + cx: &Context, + latest_blocks: &LatestBlocks, + latest_synced_block_number: BlockNumber, + latest_synced_block_hash: BlockHash, +) -> Result, Error> +where + NC: Client, +{ + let detect_reorg_futs = latest_blocks + .iter() + .filter(|(_, latest_block)| *latest_block >= latest_synced_block_number) + .map(|((i, j), _)| { + let data_source = &cx.manifest.data_sources[*i]; + let network = &data_source.network; + let dataset = &data_source.source.dataset; + let table = &data_source.source.tables[*j]; + + detect_reorg( + &cx, + network, + dataset, + table, + latest_synced_block_number, + latest_synced_block_hash, + ) + .map_err(move |e| e.context(format!("failed to detect reorg in '{dataset}.{table}'"))) + }); + + let deepest_reorg = try_join_all(detect_reorg_futs) + .await? + .into_iter() + .flatten() + .min_by_key(|latest_block_before_reorg| latest_block_before_reorg.block_number); + + Ok(deepest_reorg) +} + +async fn detect_reorg( + cx: &Context, + network: &str, + dataset: &Ident, + table: &Ident, + latest_synced_block_number: BlockNumber, + latest_synced_block_hash: BlockHash, +) -> Result, Error> +where + NC: Client, +{ + let query = format!("SELECT _block_num FROM {dataset}.{table} SETTINGS stream = true"); + let mut stream = cx.client.query( + &cx.logger, + query, + Some(RequestMetadata { + resume_streaming_query: Some(vec![ResumeStreamingQuery { + network: network.to_string(), + block_number: latest_synced_block_number, + block_hash: latest_synced_block_hash, + }]), + }), + ); + + let response = stream + .next() + .await + .ok_or_else(|| Error::NonDeterministic(anyhow!("stream is empty")))? + .map_err(Error::from)?; + + match response { + ResponseBatch::Batch { .. 
} => Ok(None), + ResponseBatch::Reorg(reorg) => reorg + .into_iter() + .exactly_one() + .map_err(|_e| Error::Deterministic(anyhow!("multi-chain datasets are not supported"))) + .map(Some), + } +} diff --git a/gnd/Cargo.toml b/gnd/Cargo.toml index 80966f9bfa4..0a1fa91aa18 100644 --- a/gnd/Cargo.toml +++ b/gnd/Cargo.toml @@ -20,6 +20,7 @@ env_logger = "0.11.8" git-testament = "0.2" lazy_static = "1.5.0" tokio = { workspace = true } +tokio-util.workspace = true serde = { workspace = true } # File watching @@ -29,4 +30,4 @@ pq-sys = { version = "0.7.2", features = ["bundled"] } openssl-sys = { version = "0.9.100", features = ["vendored"] } [target.'cfg(unix)'.dependencies] -pgtemp = { git = "https://github.com/graphprotocol/pgtemp", branch = "initdb-args" } \ No newline at end of file +pgtemp = { git = "https://github.com/graphprotocol/pgtemp", branch = "initdb-args" } diff --git a/gnd/src/main.rs b/gnd/src/main.rs index 4c34a59317e..0e8e42238a3 100644 --- a/gnd/src/main.rs +++ b/gnd/src/main.rs @@ -14,6 +14,7 @@ use graph::{ use graph_core::polling_monitor::ipfs_service; use graph_node::{launcher, opt::Opt}; use lazy_static::lazy_static; +use tokio_util::sync::CancellationToken; use gnd::watcher::{deploy_all_subgraphs, parse_manifest_args, watch_subgraphs}; @@ -159,6 +160,7 @@ async fn run_graph_node( opt: Opt, link_resolver: Arc, subgraph_updates_channel: mpsc::Receiver<(DeploymentHash, SubgraphName)>, + cancel_token: CancellationToken, ) -> Result<()> { let env_vars = Arc::new(EnvVars::from_env().context("Failed to load environment variables")?); @@ -184,6 +186,7 @@ async fn run_graph_node( Some(subgraph_updates_channel), prometheus_registry, metrics_registry, + cancel_token, ) .await; Ok(()) @@ -237,6 +240,7 @@ async fn main() -> Result<()> { let database_dir = Path::new(&dev_opt.database_dir); + let cancel_token = shutdown_token(); let logger = logger(true); info!(logger, "Starting Graph Node Dev 1"); @@ -256,7 +260,7 @@ async fn main() -> Result<()> { let logger_clone = logger.clone(); graph::spawn(async move { - let _ = run_graph_node(&logger_clone, opt, file_link_resolver, rx).await; + let _ = run_graph_node(&logger_clone, opt, file_link_resolver, rx, cancel_token).await; }); if let Err(e) = @@ -302,3 +306,39 @@ async fn main() -> Result<()> { #[allow(unreachable_code)] Ok(()) } + +fn shutdown_token() -> CancellationToken { + use tokio::signal; + + let cancel_token = CancellationToken::new(); + let cancel_token_clone = cancel_token.clone(); + + async fn shutdown_signal_handler() { + let ctrl_c = async { + signal::ctrl_c().await.unwrap(); + }; + + #[cfg(unix)] + let terminate = async { + signal::unix::signal(signal::unix::SignalKind::terminate()) + .unwrap() + .recv() + .await; + }; + + #[cfg(not(unix))] + let terminate = std::future::pending::<()>(); + + tokio::select! 
{ + _ = ctrl_c => {}, + _ = terminate => {}, + }; + } + + tokio::spawn(async move { + shutdown_signal_handler().await; + cancel_token_clone.cancel(); + }); + + cancel_token +} diff --git a/graph/Cargo.toml b/graph/Cargo.toml index ac16e60d853..a7f34b0970e 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -111,6 +111,7 @@ arrow.workspace = true heck.workspace = true lazy-regex.workspace = true sqlparser-latest.workspace = true +tokio-util.workspace = true [dev-dependencies] clap.workspace = true diff --git a/graph/src/blockchain/types.rs b/graph/src/blockchain/types.rs index 081fff4eea5..c64da4f4f7a 100644 --- a/graph/src/blockchain/types.rs +++ b/graph/src/blockchain/types.rs @@ -611,6 +611,12 @@ impl From for BlockTime { } } +impl From for BlockTime { + fn from(value: Timestamp) -> Self { + Self(value) + } +} + impl From for Value { fn from(block_time: BlockTime) -> Self { Value::Timestamp(block_time.0) diff --git a/graph/src/cheap_clone.rs b/graph/src/cheap_clone.rs index b8863d3918e..fc9c98ab7d1 100644 --- a/graph/src/cheap_clone.rs +++ b/graph/src/cheap_clone.rs @@ -106,6 +106,7 @@ cheap_clone_is_clone!(Channel); // reqwest::Client uses Arc internally, so it is CheapClone. cheap_clone_is_clone!(reqwest::Client); cheap_clone_is_clone!(slog::Logger); +cheap_clone_is_clone!(semver::Version); cheap_clone_is_copy!( (), @@ -119,3 +120,5 @@ cheap_clone_is_copy!( std::time::Duration ); cheap_clone_is_copy!(ethabi::Address); + +cheap_clone_is_clone!(tokio_util::sync::CancellationToken); diff --git a/graph/src/components/store/err.rs b/graph/src/components/store/err.rs index 446b73408f1..627320bdc76 100644 --- a/graph/src/components/store/err.rs +++ b/graph/src/components/store/err.rs @@ -247,3 +247,9 @@ impl From for StoreError { StoreError::Unknown(anyhow!("{}", e.to_string())) } } + +impl crate::nozzle::error::IsDeterministic for StoreError { + fn is_deterministic(&self) -> bool { + StoreError::is_deterministic(self) + } +} diff --git a/node/src/launcher.rs b/node/src/launcher.rs index b8b27a31a1a..1cb6558db1b 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -32,6 +32,7 @@ use graph_store_postgres::{ use graphman_server::GraphmanServer; use graphman_server::GraphmanServerConfig; use tokio::sync::mpsc; +use tokio_util::sync::CancellationToken; use crate::config::Config; use crate::helpers::watch_subgraph_updates; @@ -270,6 +271,7 @@ fn build_subgraph_registrar( arweave_service: ArweaveService, ipfs_service: IpfsService, nozzle_client: Option>, + cancel_token: CancellationToken, ) -> Arc< graph_core::subgraph::SubgraphRegistrar< graph_core::subgraph_provider::SubgraphProvider, @@ -284,6 +286,26 @@ where let static_filters = ENV_VARS.experimental_static_filters; let sg_count = Arc::new(SubgraphCountMetric::new(metrics_registry.cheap_clone())); + let mut subgraph_instance_managers = + graph_core::subgraph_provider::SubgraphInstanceManagers::new(); + + if let Some(nozzle_client) = nozzle_client.cheap_clone() { + let nozzle_instance_manager = graph_core::nozzle_subgraph::Manager::new( + &logger_factory, + metrics_registry.cheap_clone(), + env_vars.cheap_clone(), + &cancel_token, + network_store.subgraph_store(), + link_resolver.cheap_clone(), + nozzle_client, + ); + + subgraph_instance_managers.add( + graph_core::subgraph_provider::SubgraphProcessingKind::Amp, + Arc::new(nozzle_instance_manager), + ); + } + let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new( &logger_factory, env_vars.cheap_clone(), @@ -298,9 +320,6 @@ where static_filters, ); - 
let mut subgraph_instance_managers = - graph_core::subgraph_provider::SubgraphInstanceManagers::new(); - subgraph_instance_managers.add( graph_core::subgraph_provider::SubgraphProcessingKind::Trigger, Arc::new(subgraph_instance_manager), @@ -379,6 +398,7 @@ pub async fn run( dev_updates: Option>, prometheus_registry: Arc, metrics_registry: Arc, + cancel_token: CancellationToken, ) { // Log version information info!( @@ -555,6 +575,7 @@ pub async fn run( arweave_service, ipfs_service, nozzle_client, + cancel_token, ); graph::spawn( diff --git a/node/src/main.rs b/node/src/main.rs index 1bc9a0329c9..8742e097a34 100644 --- a/node/src/main.rs +++ b/node/src/main.rs @@ -26,7 +26,7 @@ fn main() { async fn main_inner() { env_logger::init(); - let _cancel_token = shutdown_token(); + let cancel_token = shutdown_token(); let env_vars = Arc::new(EnvVars::from_env().unwrap()); let opt = opt::Opt::parse(); @@ -61,6 +61,7 @@ async fn main_inner() { None, prometheus_registry, metrics_registry, + cancel_token, ) .await; } diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs index e27f94d16d4..9dd8f1bc39c 100644 --- a/node/src/manager/commands/run.rs +++ b/node/src/manager/commands/run.rs @@ -23,6 +23,7 @@ use graph::prelude::{ }; use graph::slog::{debug, info, Logger}; use graph_core::polling_monitor::{arweave_service, ipfs_service}; +use tokio_util::sync::CancellationToken; fn locate(store: &dyn SubgraphStore, hash: &str) -> Result { let mut locators = store.locators(hash)?; @@ -51,6 +52,7 @@ pub async fn run( subgraph, stop_block ); + let cancel_token = CancellationToken::new(); let env_vars = Arc::new(EnvVars::from_env().unwrap()); let metrics_registry = metrics_ctx.registry.clone(); let logger_factory = LoggerFactory::new(logger.clone(), None, metrics_ctx.registry.clone()); @@ -139,17 +141,37 @@ pub async fn run( let static_filters = ENV_VARS.experimental_static_filters; let sg_metrics = Arc::new(SubgraphCountMetric::new(metrics_registry.clone())); + let mut subgraph_instance_managers = + graph_core::subgraph_provider::SubgraphInstanceManagers::new(); + let nozzle_client = match nozzle_flight_service_address { Some(nozzle_flight_service_address) => { let addr = nozzle_flight_service_address .parse() .expect("Invalid Nozzle Flight service address"); - let nozzle_client = nozzle::FlightClient::new(addr) - .await - .expect("Failed to connect to Nozzle Flight service"); + let nozzle_client = Arc::new( + nozzle::FlightClient::new(addr) + .await + .expect("Failed to connect to Nozzle Flight service"), + ); - Some(Arc::new(nozzle_client)) + let nozzle_instance_manager = graph_core::nozzle_subgraph::Manager::new( + &logger_factory, + metrics_registry.cheap_clone(), + env_vars.cheap_clone(), + &cancel_token, + network_store.subgraph_store(), + link_resolver.cheap_clone(), + nozzle_client.cheap_clone(), + ); + + subgraph_instance_managers.add( + graph_core::subgraph_provider::SubgraphProcessingKind::Amp, + Arc::new(nozzle_instance_manager), + ); + + Some(nozzle_client) } None => None, }; @@ -168,9 +190,6 @@ pub async fn run( static_filters, ); - let mut subgraph_instance_managers = - graph_core::subgraph_provider::SubgraphInstanceManagers::new(); - subgraph_instance_managers.add( graph_core::subgraph_provider::SubgraphProcessingKind::Trigger, Arc::new(subgraph_instance_manager), @@ -180,7 +199,7 @@ pub async fn run( &logger_factory, sg_metrics.cheap_clone(), link_resolver.cheap_clone(), - tokio_util::sync::CancellationToken::new(), + cancel_token.clone(), subgraph_instance_managers, 
)); From 2df58abcab37584b31df173250882ecf1b5a15e2 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Wed, 29 Oct 2025 12:20:42 +0200 Subject: [PATCH 21/40] chore(all): rename Nozzle to Amp --- Cargo.toml | 2 +- chain/ethereum/src/runtime/runtime_adapter.rs | 2 +- .../manager.rs | 30 +++---- .../metrics.rs | 2 +- .../{nozzle_subgraph => amp_subgraph}/mod.rs | 0 .../monitor.rs | 4 +- .../runner/compat.rs | 0 .../runner/context.rs | 18 ++-- .../runner/data_processing.rs | 20 ++--- .../runner/data_stream.rs | 18 ++-- .../runner/error.rs | 2 +- .../runner/latest_blocks.rs | 28 +++--- .../runner/mod.rs | 18 ++-- .../runner/reorg_handler.rs | 22 ++--- core/src/lib.rs | 2 +- core/src/subgraph/context/instance/mod.rs | 2 +- core/src/subgraph/instance_manager.rs | 16 ++-- core/src/subgraph/registrar.rs | 30 +++---- core/src/subgraph_provider.rs | 4 +- graph/Cargo.toml | 2 +- .../{nozzle => amp}/client/flight_client.rs | 18 ++-- graph/src/{nozzle => amp}/client/mod.rs | 12 +-- .../{nozzle => amp}/codec/array_decoder.rs | 2 +- graph/src/{nozzle => amp}/codec/decoder.rs | 2 +- .../src/{nozzle => amp}/codec/list_decoder.rs | 0 .../{nozzle => amp}/codec/mapping_decoder.rs | 0 graph/src/{nozzle => amp}/codec/mod.rs | 4 +- graph/src/{nozzle => amp}/codec/name_cache.rs | 4 +- graph/src/{nozzle => amp}/codec/utils.rs | 2 +- .../{nozzle => amp}/codec/value_decoder.rs | 4 +- graph/src/{nozzle => amp}/common/ident.rs | 10 +-- graph/src/{nozzle => amp}/common/mod.rs | 6 +- graph/src/{nozzle => amp}/error.rs | 0 graph/src/{nozzle => amp}/log.rs | 2 +- .../manifest/data_source/mod.rs | 18 ++-- .../manifest/data_source/raw.rs | 63 ++++++------- graph/src/{nozzle => amp}/manifest/mod.rs | 30 +++---- graph/src/{nozzle => amp}/mod.rs | 2 +- .../schema/generator/entity.rs | 16 ++-- .../{nozzle => amp}/schema/generator/mod.rs | 8 +- graph/src/{nozzle => amp}/schema/mod.rs | 0 graph/src/{nozzle => amp}/sql/mod.rs | 0 .../sql/query/filter_blocks.rs | 2 +- graph/src/{nozzle => amp}/sql/query/mod.rs | 12 +-- .../sql/query/resolve_event_signatures.rs | 2 +- .../sql/query/resolve_source_address.rs | 0 .../sql/query/validate_tables.rs | 2 +- .../stream_aggregator/error.rs | 2 +- .../{nozzle => amp}/stream_aggregator/mod.rs | 6 +- .../record_batch/aggregator.rs | 0 .../stream_aggregator/record_batch/buffer.rs | 0 .../stream_aggregator/record_batch/decoder.rs | 2 +- .../record_batch/group_data.rs | 88 +++++++++++++++++++ .../stream_aggregator/record_batch/mod.rs | 0 graph/src/blockchain/mod.rs | 2 +- graph/src/components/store/err.rs | 2 +- graph/src/data/subgraph/mod.rs | 58 ++++++------ graph/src/data_source/mod.rs | 76 ++++++++-------- graph/src/data_source/subgraph.rs | 26 +++--- graph/src/env/{nozzle.rs => amp.rs} | 20 ++--- graph/src/env/mod.rs | 24 ++--- graph/src/lib.rs | 2 +- node/src/bin/manager.rs | 8 +- node/src/launcher.rs | 40 ++++----- node/src/manager/commands/run.rs | 30 +++---- node/src/opt.rs | 6 +- runtime/wasm/src/host.rs | 4 +- server/index-node/src/resolver.rs | 28 +++--- server/index-node/src/server.rs | 16 ++-- server/index-node/src/service.rs | 16 ++-- .../tests/chain/ethereum/manifest.rs | 20 ++--- tests/src/fixture/mod.rs | 9 +- 72 files changed, 508 insertions(+), 420 deletions(-) rename core/src/{nozzle_subgraph => amp_subgraph}/manager.rs (86%) rename core/src/{nozzle_subgraph => amp_subgraph}/metrics.rs (96%) rename core/src/{nozzle_subgraph => amp_subgraph}/mod.rs (100%) rename core/src/{nozzle_subgraph => amp_subgraph}/monitor.rs (99%) rename 
core/src/{nozzle_subgraph => amp_subgraph}/runner/compat.rs (100%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/context.rs (88%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/data_processing.rs (96%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/data_stream.rs (96%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/error.rs (96%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/latest_blocks.rs (92%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/mod.rs (91%) rename core/src/{nozzle_subgraph => amp_subgraph}/runner/reorg_handler.rs (95%) rename graph/src/{nozzle => amp}/client/flight_client.rs (95%) rename graph/src/{nozzle => amp}/client/mod.rs (88%) rename graph/src/{nozzle => amp}/codec/array_decoder.rs (99%) rename graph/src/{nozzle => amp}/codec/decoder.rs (93%) rename graph/src/{nozzle => amp}/codec/list_decoder.rs (100%) rename graph/src/{nozzle => amp}/codec/mapping_decoder.rs (100%) rename graph/src/{nozzle => amp}/codec/mod.rs (98%) rename graph/src/{nozzle => amp}/codec/name_cache.rs (90%) rename graph/src/{nozzle => amp}/codec/utils.rs (98%) rename graph/src/{nozzle => amp}/codec/value_decoder.rs (99%) rename graph/src/{nozzle => amp}/common/ident.rs (95%) rename graph/src/{nozzle => amp}/common/mod.rs (76%) rename graph/src/{nozzle => amp}/error.rs (100%) rename graph/src/{nozzle => amp}/log.rs (87%) rename graph/src/{nozzle => amp}/manifest/data_source/mod.rs (89%) rename graph/src/{nozzle => amp}/manifest/data_source/raw.rs (91%) rename graph/src/{nozzle => amp}/manifest/mod.rs (71%) rename graph/src/{nozzle => amp}/mod.rs (76%) rename graph/src/{nozzle => amp}/schema/generator/entity.rs (91%) rename graph/src/{nozzle => amp}/schema/generator/mod.rs (88%) rename graph/src/{nozzle => amp}/schema/mod.rs (100%) rename graph/src/{nozzle => amp}/sql/mod.rs (100%) rename graph/src/{nozzle => amp}/sql/query/filter_blocks.rs (98%) rename graph/src/{nozzle => amp}/sql/query/mod.rs (94%) rename graph/src/{nozzle => amp}/sql/query/resolve_event_signatures.rs (98%) rename graph/src/{nozzle => amp}/sql/query/resolve_source_address.rs (100%) rename graph/src/{nozzle => amp}/sql/query/validate_tables.rs (98%) rename graph/src/{nozzle => amp}/stream_aggregator/error.rs (96%) rename graph/src/{nozzle => amp}/stream_aggregator/mod.rs (97%) rename graph/src/{nozzle => amp}/stream_aggregator/record_batch/aggregator.rs (100%) rename graph/src/{nozzle => amp}/stream_aggregator/record_batch/buffer.rs (100%) rename graph/src/{nozzle => amp}/stream_aggregator/record_batch/decoder.rs (98%) create mode 100644 graph/src/amp/stream_aggregator/record_batch/group_data.rs rename graph/src/{nozzle => amp}/stream_aggregator/record_batch/mod.rs (100%) rename graph/src/env/{nozzle.rs => amp.rs} (82%) diff --git a/Cargo.toml b/Cargo.toml index 351063d4b34..a2261f4c2a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -98,7 +98,7 @@ substreams-entity-change = "2" substreams-near-core = "=0.10.2" rand = { version = "0.9.2", features = ["os_rng"] } -# Dependencies related to Nozzle Subgraphs +# Dependencies related to Amp subgraphs ahash = "0.8.11" alloy = { version = "1.0.12", default-features = false, features = ["json-abi", "serde"] } arrow = { version = "=55.0.0" } diff --git a/chain/ethereum/src/runtime/runtime_adapter.rs b/chain/ethereum/src/runtime/runtime_adapter.rs index 3a6103fc177..acbf41c62a3 100644 --- a/chain/ethereum/src/runtime/runtime_adapter.rs +++ b/chain/ethereum/src/runtime/runtime_adapter.rs @@ -182,7 +182,7 @@ impl 
blockchain::RuntimeAdapter for RuntimeAdapter { create_host_fns(abis, archive, call_cache, eth_adapters, eth_call_gas) } data_source::DataSource::Offchain(_) => vec![], - data_source::DataSource::Nozzle(_) => vec![], + data_source::DataSource::Amp(_) => vec![], }; Ok(host_fns) diff --git a/core/src/nozzle_subgraph/manager.rs b/core/src/amp_subgraph/manager.rs similarity index 86% rename from core/src/nozzle_subgraph/manager.rs rename to core/src/amp_subgraph/manager.rs index 5f757ce71a6..041a30df226 100644 --- a/core/src/nozzle_subgraph/manager.rs +++ b/core/src/amp_subgraph/manager.rs @@ -4,6 +4,7 @@ use alloy::primitives::BlockNumber; use anyhow::Context; use async_trait::async_trait; use graph::{ + amp, components::{ link_resolver::{LinkResolver, LinkResolverContext}, metrics::MetricsRegistry, @@ -12,7 +13,6 @@ use graph::{ }, env::EnvVars, log::factory::LoggerFactory, - nozzle, prelude::CheapClone, }; use slog::{debug, error}; @@ -20,10 +20,10 @@ use tokio_util::sync::CancellationToken; use super::{runner, Metrics, Monitor}; -/// Manages Nozzle subgraph runner futures. +/// Manages Amp subgraph runner futures. /// -/// Creates and schedules Nozzle subgraph runner futures for execution on demand. -/// Also handles stopping previously started Nozzle subgraph runners. +/// Creates and schedules Amp subgraph runner futures for execution on demand. +/// Also handles stopping previously started Amp subgraph runners. pub struct Manager { logger_factory: LoggerFactory, metrics_registry: Arc, @@ -31,15 +31,15 @@ pub struct Manager { monitor: Monitor, subgraph_store: Arc, link_resolver: Arc, - nozzle_client: Arc, + amp_client: Arc, } impl Manager where SS: SubgraphStore, - NC: nozzle::Client, + NC: amp::Client, { - /// Creates a new Nozzle subgraph manager. + /// Creates a new Amp subgraph manager. 
pub fn new( logger_factory: &LoggerFactory, metrics_registry: Arc, @@ -47,9 +47,9 @@ where cancel_token: &CancellationToken, subgraph_store: Arc, link_resolver: Arc, - nozzle_client: Arc, + amp_client: Arc, ) -> Self { - let logger = logger_factory.component_logger("NozzleSubgraphManager", None); + let logger = logger_factory.component_logger("AmpSubgraphManager", None); let logger_factory = logger_factory.with_parent(logger); let monitor = Monitor::new(&logger_factory, cancel_token); @@ -61,7 +61,7 @@ where monitor, subgraph_store, link_resolver, - nozzle_client, + amp_client, } } } @@ -70,7 +70,7 @@ where impl SubgraphInstanceManager for Manager where SS: SubgraphStore, - NC: nozzle::Client + Send + Sync + 'static, + NC: amp::Client + Send + Sync + 'static, { async fn start_subgraph( self: Arc, @@ -115,10 +115,10 @@ where let raw_manifest = serde_yaml::from_slice(&manifest_bytes) .context("failed to parse subgraph manifest")?; - let mut manifest = nozzle::Manifest::resolve::( + let mut manifest = amp::Manifest::resolve::( &logger, manager.link_resolver.cheap_clone(), - manager.nozzle_client.cheap_clone(), + manager.amp_client.cheap_clone(), manager.env_vars.max_spec_version.cheap_clone(), deployment.hash.cheap_clone(), raw_manifest, @@ -138,8 +138,8 @@ where let runner_context = runner::Context::new( &logger, - &manager.env_vars.nozzle, - manager.nozzle_client.cheap_clone(), + &manager.env_vars.amp, + manager.amp_client.cheap_clone(), store, deployment.hash.cheap_clone(), manifest, diff --git a/core/src/nozzle_subgraph/metrics.rs b/core/src/amp_subgraph/metrics.rs similarity index 96% rename from core/src/nozzle_subgraph/metrics.rs rename to core/src/amp_subgraph/metrics.rs index 6bdb266b337..f89e85acfdd 100644 --- a/core/src/nozzle_subgraph/metrics.rs +++ b/core/src/amp_subgraph/metrics.rs @@ -26,7 +26,7 @@ impl Metrics { let stopwatch = StopwatchMetrics::new( logger.cheap_clone(), deployment, - "nozzle-process", + "amp-process", metrics_registry, store.shard().to_string(), ); diff --git a/core/src/nozzle_subgraph/mod.rs b/core/src/amp_subgraph/mod.rs similarity index 100% rename from core/src/nozzle_subgraph/mod.rs rename to core/src/amp_subgraph/mod.rs diff --git a/core/src/nozzle_subgraph/monitor.rs b/core/src/amp_subgraph/monitor.rs similarity index 99% rename from core/src/nozzle_subgraph/monitor.rs rename to core/src/amp_subgraph/monitor.rs index 0d49b8cc1e2..cfa1de2942d 100644 --- a/core/src/nozzle_subgraph/monitor.rs +++ b/core/src/amp_subgraph/monitor.rs @@ -92,7 +92,7 @@ impl Monitor { /// A new cancel token is derived from the `cancel_token` and only the derived token is used by the /// subgraph monitor and its background process. pub(super) fn new(logger_factory: &LoggerFactory, cancel_token: &CancellationToken) -> Self { - let logger = logger_factory.component_logger("NozzleSubgraphMonitor", None); + let logger = logger_factory.component_logger("AmpSubgraphMonitor", None); let logger_factory = Arc::new(logger_factory.with_parent(logger)); // A derived token makes sure it is not possible to accidentally cancel the parent token @@ -221,7 +221,7 @@ impl Monitor { _ = cancel_token.cancelled() => { debug!(logger, "Stopping command processor"); - // All active Subgraphs will shutdown gracefully + // All active subgraphs will shutdown gracefully // because their cancel tokens are derived from this cancelled token. 
return; } diff --git a/core/src/nozzle_subgraph/runner/compat.rs b/core/src/amp_subgraph/runner/compat.rs similarity index 100% rename from core/src/nozzle_subgraph/runner/compat.rs rename to core/src/amp_subgraph/runner/compat.rs diff --git a/core/src/nozzle_subgraph/runner/context.rs b/core/src/amp_subgraph/runner/context.rs similarity index 88% rename from core/src/nozzle_subgraph/runner/context.rs rename to core/src/amp_subgraph/runner/context.rs index 7d7ad1db741..2dbb44f8bb8 100644 --- a/core/src/nozzle_subgraph/runner/context.rs +++ b/core/src/amp_subgraph/runner/context.rs @@ -2,21 +2,21 @@ use std::sync::Arc; use alloy::primitives::{BlockHash, BlockNumber}; use graph::{ + amp::{log::Logger as _, Codec, Manifest}, cheap_clone::CheapClone, components::store::WritableStore, data::subgraph::DeploymentHash, - env::NozzleEnv, - nozzle::{log::Logger as _, Codec, Manifest}, + env::AmpEnv, util::backoff::ExponentialBackoff, }; use slog::Logger; use super::Compat; -use crate::nozzle_subgraph::Metrics; +use crate::amp_subgraph::Metrics; -pub(in super::super) struct Context { +pub(in super::super) struct Context { pub(super) logger: Logger, - pub(super) client: Arc, + pub(super) client: Arc, pub(super) store: Arc, pub(super) max_buffer_size: usize, pub(super) max_block_range: usize, @@ -27,17 +27,17 @@ pub(in super::super) struct Context { pub(super) codec: Codec, } -impl Context { +impl Context { pub(in super::super) fn new( logger: &Logger, - env: &NozzleEnv, - client: Arc, + env: &AmpEnv, + client: Arc, store: Arc, deployment: DeploymentHash, manifest: Manifest, metrics: Metrics, ) -> Self { - let logger = logger.component("NozzleSubgraphRunner"); + let logger = logger.component("AmpSubgraphRunner"); let backoff = ExponentialBackoff::new(env.query_retry_min_delay, env.query_retry_max_delay); let codec = Codec::new(manifest.schema.cheap_clone()); diff --git a/core/src/nozzle_subgraph/runner/data_processing.rs b/core/src/amp_subgraph/runner/data_processing.rs similarity index 96% rename from core/src/nozzle_subgraph/runner/data_processing.rs rename to core/src/amp_subgraph/runner/data_processing.rs index b893d1f232f..bb6dc87597a 100644 --- a/core/src/nozzle_subgraph/runner/data_processing.rs +++ b/core/src/amp_subgraph/runner/data_processing.rs @@ -5,20 +5,20 @@ use anyhow::anyhow; use arrow::array::RecordBatch; use chrono::{DateTime, Utc}; use graph::{ - blockchain::block_stream::FirehoseCursor, - cheap_clone::CheapClone, - components::store::{EntityCache, ModificationsAndCache}, - nozzle::{ + amp::{ codec::{utils::auto_block_timestamp_decoder, DecodeOutput, DecodedEntity, Decoder}, stream_aggregator::{RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}, }, + blockchain::block_stream::FirehoseCursor, + cheap_clone::CheapClone, + components::store::{EntityCache, ModificationsAndCache}, }; use slog::{debug, trace}; use super::{data_stream::TablePtr, Compat, Context, Error}; -pub(super) async fn process_record_batch_groups( - cx: &mut Context, +pub(super) async fn process_record_batch_groups( + cx: &mut Context, mut entity_cache: EntityCache, record_batch_groups: RecordBatchGroups, stream_table_ptr: Arc<[TablePtr]>, @@ -72,8 +72,8 @@ pub(super) async fn process_record_batch_groups( Ok(entity_cache) } -async fn process_record_batch_group( - cx: &mut Context, +async fn process_record_batch_group( + cx: &mut Context, mut entity_cache: EntityCache, block_number: BlockNumber, block_hash: BlockHash, @@ -146,8 +146,8 @@ async fn process_record_batch_group( )) } -async fn process_record_batch( - 
cx: &mut Context, +async fn process_record_batch( + cx: &mut Context, entity_cache: &mut EntityCache, block_number: BlockNumber, record_batch: RecordBatch, diff --git a/core/src/nozzle_subgraph/runner/data_stream.rs b/core/src/amp_subgraph/runner/data_stream.rs similarity index 96% rename from core/src/nozzle_subgraph/runner/data_stream.rs rename to core/src/amp_subgraph/runner/data_stream.rs index f09f332068d..1e8211b3e8e 100644 --- a/core/src/nozzle_subgraph/runner/data_stream.rs +++ b/core/src/amp_subgraph/runner/data_stream.rs @@ -7,12 +7,12 @@ use futures::{ StreamExt, TryStreamExt, }; use graph::{ - cheap_clone::CheapClone, - nozzle::{ + amp::{ manifest::DataSource, stream_aggregator::{RecordBatchGroups, StreamAggregator}, Client, }, + cheap_clone::CheapClone, }; use slog::{debug, warn}; @@ -20,12 +20,12 @@ use super::{Context, Error}; pub(super) type TablePtr = (usize, usize); -pub(super) fn new_data_stream( - cx: &Context, +pub(super) fn new_data_stream( + cx: &Context, latest_block: BlockNumber, ) -> BoxStream<'static, Result<(RecordBatchGroups, Arc<[TablePtr]>), Error>> where - NC: Client, + AC: Client, { let logger = cx.logger.new(slog::o!("process" => "new_data_stream")); @@ -118,8 +118,8 @@ where merged_data_stream } -fn next_block_ranges( - cx: &Context, +fn next_block_ranges( + cx: &Context, latest_queried_block: Option, latest_block: BlockNumber, ) -> HashMap> { @@ -148,8 +148,8 @@ fn next_block_ranges( .collect() } -fn next_block_range( - cx: &Context, +fn next_block_range( + cx: &Context, data_source: &DataSource, latest_queried_block: Option, latest_block: BlockNumber, diff --git a/core/src/nozzle_subgraph/runner/error.rs b/core/src/amp_subgraph/runner/error.rs similarity index 96% rename from core/src/nozzle_subgraph/runner/error.rs rename to core/src/amp_subgraph/runner/error.rs index a59ecc30576..8c7077e1c68 100644 --- a/core/src/nozzle_subgraph/runner/error.rs +++ b/core/src/amp_subgraph/runner/error.rs @@ -1,4 +1,4 @@ -use graph::nozzle::error::IsDeterministic; +use graph::amp::error::IsDeterministic; use thiserror::Error; #[derive(Debug, Error)] diff --git a/core/src/nozzle_subgraph/runner/latest_blocks.rs b/core/src/amp_subgraph/runner/latest_blocks.rs similarity index 92% rename from core/src/nozzle_subgraph/runner/latest_blocks.rs rename to core/src/amp_subgraph/runner/latest_blocks.rs index 55a4ef2dc93..e0fc7b5e1b4 100644 --- a/core/src/nozzle_subgraph/runner/latest_blocks.rs +++ b/core/src/amp_subgraph/runner/latest_blocks.rs @@ -3,8 +3,7 @@ use anyhow::anyhow; use arrow::array::RecordBatch; use futures::{future::try_join_all, stream::BoxStream, StreamExt, TryFutureExt}; use graph::{ - cheap_clone::CheapClone, - nozzle::{ + amp::{ client::ResponseBatch, codec::{utils::block_number_decoder, Decoder}, common::Ident, @@ -12,6 +11,7 @@ use graph::{ manifest::DataSource, Client, }, + cheap_clone::CheapClone, }; use itertools::Itertools; use slog::debug; @@ -23,9 +23,9 @@ pub(super) type TablePtr = (usize, usize); pub(super) struct LatestBlocks(Vec<(TablePtr, BlockNumber)>); impl LatestBlocks { - pub(super) async fn load(cx: &Context) -> Result + pub(super) async fn load(cx: &Context) -> Result where - NC: Client, + AC: Client, { debug!(cx.logger, "Loading latest blocks"); @@ -57,9 +57,9 @@ impl LatestBlocks { try_join_all(latest_block_futs).await.map(Self) } - pub(super) fn filter_completed(self, cx: &Context) -> Self + pub(super) fn filter_completed(self, cx: &Context) -> Self where - NC: Client, + AC: Client, { let latest_synced_block = cx.latest_synced_block(); 
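The `latest_blocks` helpers renamed below poll each source table's head with a one-shot `MAX(_block_num)` query and then wait for growth with a streaming query. A small std-only sketch of how those two query strings are composed; the dataset and table names are hypothetical, the format strings mirror the hunks that follow:

```rust
// `dataset` and `table` stand in for the resolved `Ident` values of a data source.
fn latest_block_query(dataset: &str, table: &str) -> String {
    // One-shot query returning the current head block of the source table.
    format!("SELECT MAX(_block_num) FROM {dataset}.{table}")
}

fn latest_block_changed_query(dataset: &str, table: &str, latest_block: u64) -> String {
    // Streaming query: the server keeps the stream open and emits a batch as
    // soon as a block above `latest_block` appears, which wakes the runner.
    format!(
        "SELECT _block_num FROM {dataset}.{table} \
         WHERE _block_num > {latest_block} SETTINGS stream = true"
    )
}

fn main() {
    assert_eq!(
        latest_block_query("eth_firehose", "blocks"),
        "SELECT MAX(_block_num) FROM eth_firehose.blocks"
    );
    println!("{}", latest_block_changed_query("eth_firehose", "blocks", 21_000_000));
}
```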
@@ -81,9 +81,9 @@ impl LatestBlocks { .unwrap() } - pub(super) async fn changed(self, cx: &Context) -> Result<(), Error> + pub(super) async fn changed(self, cx: &Context) -> Result<(), Error> where - NC: Client, + AC: Client, { debug!(cx.logger, "Waiting for new blocks"); @@ -125,13 +125,13 @@ fn indexing_completed(data_source: &DataSource, latest_synced_block: &Option= data_source.source.end_block) } -async fn latest_block( - cx: &Context, +async fn latest_block( + cx: &Context, dataset: &Ident, table: &Ident, ) -> Result where - NC: Client, + AC: Client, { let query = format!("SELECT MAX(_block_num) FROM {dataset}.{table}"); let stream = cx.client.query(&cx.logger, query, None); @@ -146,14 +146,14 @@ where Ok(latest_block) } -async fn latest_block_changed( - cx: &Context, +async fn latest_block_changed( + cx: &Context, dataset: &Ident, table: &Ident, latest_block: BlockNumber, ) -> Result<(), Error> where - NC: Client, + AC: Client, { let query = format!("SELECT _block_num FROM {dataset}.{table} WHERE _block_num > {latest_block} SETTINGS stream = true"); let stream = cx.client.query(&cx.logger, query, None); diff --git a/core/src/nozzle_subgraph/runner/mod.rs b/core/src/amp_subgraph/runner/mod.rs similarity index 91% rename from core/src/nozzle_subgraph/runner/mod.rs rename to core/src/amp_subgraph/runner/mod.rs index c7088b56a11..b7e65d62851 100644 --- a/core/src/nozzle_subgraph/runner/mod.rs +++ b/core/src/amp_subgraph/runner/mod.rs @@ -9,8 +9,8 @@ mod reorg_handler; use anyhow::Result; use futures::{future::BoxFuture, StreamExt}; use graph::{ - cheap_clone::CheapClone, components::store::EntityCache, data::subgraph::schema::SubgraphError, - nozzle::Client, + amp::Client, cheap_clone::CheapClone, components::store::EntityCache, + data::subgraph::schema::SubgraphError, }; use slog::{debug, error, warn}; use tokio_util::sync::CancellationToken; @@ -22,11 +22,11 @@ use self::{ pub(super) use self::context::Context; -pub(super) fn new_runner( - mut cx: Context, +pub(super) fn new_runner( + mut cx: Context, ) -> Box BoxFuture<'static, Result<()>> + Send + 'static> where - NC: Client + Send + Sync + 'static, + AC: Client + Send + Sync + 'static, { Box::new(move |cancel_token| { Box::pin(async move { @@ -47,9 +47,9 @@ where }) } -async fn run_indexing(cx: &mut Context) -> Result<(), Error> +async fn run_indexing(cx: &mut Context) -> Result<(), Error> where - NC: Client, + AC: Client, { loop { debug!(cx.logger, "Running indexing"; @@ -100,9 +100,9 @@ where } } -async fn run_indexing_with_retries(cx: &mut Context) -> Result<()> +async fn run_indexing_with_retries(cx: &mut Context) -> Result<()> where - NC: Client, + AC: Client, { loop { match run_indexing(cx).await { diff --git a/core/src/nozzle_subgraph/runner/reorg_handler.rs b/core/src/amp_subgraph/runner/reorg_handler.rs similarity index 95% rename from core/src/nozzle_subgraph/runner/reorg_handler.rs rename to core/src/amp_subgraph/runner/reorg_handler.rs index e18886e6168..5e6fb6beff7 100644 --- a/core/src/nozzle_subgraph/runner/reorg_handler.rs +++ b/core/src/amp_subgraph/runner/reorg_handler.rs @@ -2,24 +2,24 @@ use alloy::primitives::{BlockHash, BlockNumber}; use anyhow::anyhow; use futures::{future::try_join_all, StreamExt, TryFutureExt}; use graph::{ - blockchain::block_stream::FirehoseCursor, - nozzle::{ + amp::{ client::{LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery}, common::Ident, Client, }, + blockchain::block_stream::FirehoseCursor, }; use itertools::Itertools; use slog::debug; use super::{Compat, 
Context, Error, LatestBlocks}; -pub(super) async fn check_and_handle_reorg( - cx: &Context, +pub(super) async fn check_and_handle_reorg( + cx: &Context, latest_blocks: &LatestBlocks, ) -> Result<(), Error> where - NC: Client, + AC: Client, { let logger = cx .logger @@ -83,14 +83,14 @@ where Ok(()) } -async fn detect_deepest_reorg( - cx: &Context, +async fn detect_deepest_reorg( + cx: &Context, latest_blocks: &LatestBlocks, latest_synced_block_number: BlockNumber, latest_synced_block_hash: BlockHash, ) -> Result, Error> where - NC: Client, + AC: Client, { let detect_reorg_futs = latest_blocks .iter() @@ -121,8 +121,8 @@ where Ok(deepest_reorg) } -async fn detect_reorg( - cx: &Context, +async fn detect_reorg( + cx: &Context, network: &str, dataset: &Ident, table: &Ident, @@ -130,7 +130,7 @@ async fn detect_reorg( latest_synced_block_hash: BlockHash, ) -> Result, Error> where - NC: Client, + AC: Client, { let query = format!("SELECT _block_num FROM {dataset}.{table} SETTINGS stream = true"); let mut stream = cx.client.query( diff --git a/core/src/lib.rs b/core/src/lib.rs index 45a4a7896d5..61de81c0b64 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,4 +1,4 @@ -pub mod nozzle_subgraph; +pub mod amp_subgraph; pub mod polling_monitor; pub mod subgraph; diff --git a/core/src/subgraph/context/instance/mod.rs b/core/src/subgraph/context/instance/mod.rs index ade6981a6ee..0d14ae8d758 100644 --- a/core/src/subgraph/context/instance/mod.rs +++ b/core/src/subgraph/context/instance/mod.rs @@ -182,7 +182,7 @@ where Ok(Some(host)) } } - DataSource::Nozzle(_) => unreachable!(), + DataSource::Amp(_) => unreachable!(), } } diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs index c2b0dfaf468..77b4b7288f1 100644 --- a/core/src/subgraph/instance_manager.rs +++ b/core/src/subgraph/instance_manager.rs @@ -9,6 +9,7 @@ use crate::subgraph::Decoder; use std::collections::BTreeSet; use crate::subgraph::runner::SubgraphRunner; +use graph::amp; use graph::blockchain::block_stream::{BlockStreamMetrics, TriggersAdapterWrapper}; use graph::blockchain::{Blockchain, BlockchainKind, DataSource, NodeCapabilities}; use graph::components::link_resolver::LinkResolverContext; @@ -20,7 +21,6 @@ use graph::data::subgraph::{UnresolvedSubgraphManifest, SPEC_VERSION_0_0_6}; use graph::data::value::Word; use graph::data_source::causality_region::CausalityRegionSeq; use graph::env::EnvVars; -use graph::nozzle; use graph::prelude::{SubgraphInstanceManager as SubgraphInstanceManagerTrait, *}; use graph::{blockchain::BlockchainMap, components::store::DeploymentLocator}; use graph_runtime_wasm::module::ToAscPtr; @@ -32,7 +32,7 @@ use super::SubgraphTriggerProcessor; use crate::subgraph::runner::SubgraphRunnerError; #[derive(Clone)] -pub struct SubgraphInstanceManager { +pub struct SubgraphInstanceManager { logger_factory: LoggerFactory, subgraph_store: Arc, chains: Arc, @@ -41,7 +41,7 @@ pub struct SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, - nozzle_client: Option>, + amp_client: Option>, static_filters: bool, env_vars: Arc, @@ -61,7 +61,7 @@ pub struct SubgraphInstanceManager { #[async_trait] impl SubgraphInstanceManagerTrait for SubgraphInstanceManager where - NC: nozzle::Client + Send + Sync + 'static, + NC: amp::Client + Send + Sync + 'static, { async fn start_subgraph( self: Arc, @@ -189,7 +189,7 @@ where } } -impl SubgraphInstanceManager { +impl SubgraphInstanceManager { pub fn new( logger_factory: &LoggerFactory, env_vars: 
Arc, @@ -200,7 +200,7 @@ impl SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, - nozzle_client: Option>, + amp_client: Option>, static_filters: bool, ) -> Self { let logger = logger_factory.component_logger("SubgraphInstanceManager", None); @@ -214,7 +214,7 @@ impl SubgraphInstanceManager { instances: SubgraphKeepAlive::new(sg_metrics), link_resolver, ipfs_service, - nozzle_client, + amp_client, static_filters, env_vars, arweave_service, @@ -332,7 +332,7 @@ impl SubgraphInstanceManager { .resolve( &deployment.hash, &link_resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), &logger, ENV_VARS.max_spec_version.clone(), ) diff --git a/core/src/subgraph/registrar.rs b/core/src/subgraph/registrar.rs index c67c3dbf293..11892bcb53c 100644 --- a/core/src/subgraph/registrar.rs +++ b/core/src/subgraph/registrar.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use async_trait::async_trait; +use graph::amp; use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; use graph::components::{ link_resolver::LinkResolverContext, @@ -12,19 +13,18 @@ use graph::data::{ value::Word, }; use graph::futures03::{self, future::TryFutureExt, Stream, StreamExt}; -use graph::nozzle; use graph::prelude::{CreateSubgraphResult, SubgraphRegistrar as SubgraphRegistrarTrait, *}; use graph::tokio_retry::Retry; use graph::util::futures::{retry_strategy, RETRY_DEFAULT_LIMIT}; -pub struct SubgraphRegistrar { +pub struct SubgraphRegistrar { logger: Logger, logger_factory: LoggerFactory, resolver: Arc, provider: Arc

, store: Arc, subscription_manager: Arc, - nozzle_client: Option>, + amp_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -32,12 +32,12 @@ pub struct SubgraphRegistrar { settings: Arc, } -impl SubgraphRegistrar +impl SubgraphRegistrar where P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { pub fn new( logger_factory: &LoggerFactory, @@ -45,7 +45,7 @@ where provider: Arc

, store: Arc, subscription_manager: Arc, - nozzle_client: Option>, + amp_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -63,7 +63,7 @@ where provider, store, subscription_manager, - nozzle_client, + amp_client, chains, node_id, version_switching_mode, @@ -222,12 +222,12 @@ where } #[async_trait] -impl SubgraphRegistrarTrait for SubgraphRegistrar +impl SubgraphRegistrarTrait for SubgraphRegistrar where P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { async fn create_subgraph( &self, @@ -312,7 +312,7 @@ where debug_fork, self.version_switching_mode, &resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), history_blocks, ) .await? @@ -331,7 +331,7 @@ where debug_fork, self.version_switching_mode, &resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), history_blocks, ) .await? @@ -350,7 +350,7 @@ where debug_fork, self.version_switching_mode, &resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), history_blocks, ) .await? @@ -464,7 +464,7 @@ async fn resolve_graft_block( }) } -async fn create_subgraph_version( +async fn create_subgraph_version( logger: &Logger, store: Arc, chains: Arc, @@ -477,7 +477,7 @@ async fn create_subgraph_version, version_switching_mode: SubgraphVersionSwitchingMode, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, history_blocks_override: Option, ) -> Result { let raw_string = serde_yaml::to_string(&raw).unwrap(); @@ -486,7 +486,7 @@ async fn create_subgraph_version Result { let is_https = addr.scheme() == Some(&http::uri::Scheme::HTTPS); let mut endpoint = Endpoint::from(addr) @@ -82,7 +82,7 @@ impl Client for FlightClient { logger: &Logger, query: impl ToString, ) -> BoxFuture<'static, Result> { - let logger = logger.component("nozzle::FlightClient"); + let logger = logger.component("AmpFlightClient"); let mut raw_client = self.raw_client(); let query = query.to_string(); @@ -110,7 +110,7 @@ impl Client for FlightClient { ) -> BoxStream<'static, Result> { let query = query.to_string(); let logger = logger - .component("nozzle::FlightClient") + .component("AmpFlightClient") .new(slog::o!("query_id" => query_id(&query))); let mut raw_client = self.raw_client(); @@ -129,7 +129,7 @@ impl Client for FlightClient { .collect(); raw_client.set_header( - "nozzle-resume", + "amp-resume", serialize_resume_streaming_query(resume_streaming_query), ); } @@ -252,7 +252,7 @@ impl error::IsDeterministic for Error { /// Metadata received with every record batch. #[derive(Debug, Clone, Deserialize)] struct Metadata { - /// Block ranges processed by the Nozzle server to produce the record batch. + /// Block ranges processed by the Amp server to produce the record batch. ranges: Vec, } @@ -267,7 +267,7 @@ impl Metadata { } } -/// Block range processed by the Nozzle server to produce a record batch. +/// Block range processed by the Amp server to produce a record batch. #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] struct BlockRange { /// Network that contains the source data for the dataset. 
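Two editorial sketches around the Flight client hunks above. First, the scheme-dependent TLS setup: the client builds a tonic `Endpoint` from the configured URI and enables TLS only for `https` addresses. This assumes tonic's `tls` feature and collapses error handling into `anyhow`; it is not the PR's exact constructor.

```rust
use tonic::transport::{ClientTlsConfig, Endpoint};

// Build a gRPC endpoint for the Flight service, mirroring the scheme check above.
fn flight_endpoint(addr: http::Uri) -> anyhow::Result<Endpoint> {
    let is_https = addr.scheme() == Some(&http::uri::Scheme::HTTPS);
    let endpoint = Endpoint::from(addr);
    Ok(if is_https {
        // Uses whatever roots the enabled tonic TLS backend provides.
        endpoint.tls_config(ClientTlsConfig::new())?
    } else {
        endpoint
    })
}
```

Second, how a caller consumes the streaming query interface, based on the `client.query(&logger, query, None)` call sites elsewhere in this patch. The trait signatures are abridged in the diff, so treat this as a sketch; the query text is hypothetical.

```rust
use graph::{
    amp::{client::ResponseBatch, Client},
    futures03::StreamExt,
};
use slog::Logger;

// Drain a streaming query, separating data batches from reorg notices.
async fn drain_query<C: Client>(client: &C, logger: &Logger) -> Result<(), C::Error> {
    let mut stream = client.query(logger, "SELECT _block_num FROM eth.blocks", None);
    while let Some(batch) = stream.next().await {
        match batch? {
            // Regular data: an Arrow record batch plus server-side metadata.
            ResponseBatch::Batch { data } => println!("received {} rows", data.num_rows()),
            // Reorg notice: sent before the first record batch that follows the reorg.
            ResponseBatch::Reorg(blocks) => println!("reorg affecting {} entries", blocks.len()),
        }
    }
    Ok(())
}
```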
diff --git a/graph/src/nozzle/client/mod.rs b/graph/src/amp/client/mod.rs similarity index 88% rename from graph/src/nozzle/client/mod.rs rename to graph/src/amp/client/mod.rs index 0832ccf8864..34999da03fa 100644 --- a/graph/src/nozzle/client/mod.rs +++ b/graph/src/amp/client/mod.rs @@ -7,9 +7,9 @@ use arrow::{array::RecordBatch, datatypes::Schema}; use futures03::{future::BoxFuture, stream::BoxStream}; use slog::Logger; -use crate::nozzle::error; +use crate::amp::error; -/// Client for connecting to Nozzle core and executing SQL queries. +/// Client for connecting to Amp core and executing SQL queries. pub trait Client { type Error: Error + error::IsDeterministic + Send + Sync + 'static; @@ -29,7 +29,7 @@ pub trait Client { ) -> BoxStream<'static, Result>; } -/// Metadata sent to the Nozzle server with the SQL query. +/// Metadata sent to the Amp server with the SQL query. #[derive(Debug, Clone)] pub struct RequestMetadata { /// Allows resuming streaming SQL queries from any block. @@ -53,13 +53,13 @@ pub struct ResumeStreamingQuery { pub block_hash: BlockHash, } -/// Represents a batch response resulting from query execution on the Nozzle server. +/// Represents a batch response resulting from query execution on the Amp server. #[derive(Debug, Clone)] pub enum ResponseBatch { - /// Contains the batch data received from the Nozzle server. + /// Contains the batch data received from the Amp server. Batch { data: RecordBatch }, - /// Contains the reorg message received from the Nozzle server. + /// Contains the reorg message received from the Amp server. /// /// It is received before the record batch that contains the data after the reorg. Reorg(Vec), diff --git a/graph/src/nozzle/codec/array_decoder.rs b/graph/src/amp/codec/array_decoder.rs similarity index 99% rename from graph/src/nozzle/codec/array_decoder.rs rename to graph/src/amp/codec/array_decoder.rs index 8de0cbb2702..d8bc677472a 100644 --- a/graph/src/nozzle/codec/array_decoder.rs +++ b/graph/src/amp/codec/array_decoder.rs @@ -18,7 +18,7 @@ use chrono::{DateTime, Utc}; use super::decoder::Decoder; use crate::data::store::scalar::{BigDecimal, BigInt}; -/// Decodes Arrow arrays into Subgraph types. +/// Decodes Arrow arrays into subgraph types. pub struct ArrayDecoder<'a, T: 'static>(&'a T); impl<'a, T> ArrayDecoder<'a, T> diff --git a/graph/src/nozzle/codec/decoder.rs b/graph/src/amp/codec/decoder.rs similarity index 93% rename from graph/src/nozzle/codec/decoder.rs rename to graph/src/amp/codec/decoder.rs index c0c479ab292..9b5c340e891 100644 --- a/graph/src/nozzle/codec/decoder.rs +++ b/graph/src/amp/codec/decoder.rs @@ -1,6 +1,6 @@ use anyhow::Result; -/// Decodes Arrow data at specific row indices into Subgraph types. +/// Decodes Arrow data at specific row indices into subgraph types. /// /// This trait provides a common interface for converting Arrow format data into /// custom types. 
Implementations handle the specifics of extracting data from diff --git a/graph/src/nozzle/codec/list_decoder.rs b/graph/src/amp/codec/list_decoder.rs similarity index 100% rename from graph/src/nozzle/codec/list_decoder.rs rename to graph/src/amp/codec/list_decoder.rs diff --git a/graph/src/nozzle/codec/mapping_decoder.rs b/graph/src/amp/codec/mapping_decoder.rs similarity index 100% rename from graph/src/nozzle/codec/mapping_decoder.rs rename to graph/src/amp/codec/mapping_decoder.rs diff --git a/graph/src/nozzle/codec/mod.rs b/graph/src/amp/codec/mod.rs similarity index 98% rename from graph/src/nozzle/codec/mod.rs rename to graph/src/amp/codec/mod.rs index a116c06c77f..b2b016322f6 100644 --- a/graph/src/nozzle/codec/mod.rs +++ b/graph/src/amp/codec/mod.rs @@ -14,18 +14,18 @@ use arrow::array::{Array, RecordBatch}; use self::{list_decoder::ListDecoder, mapping_decoder::MappingDecoder, name_cache::NameCache}; use crate::{ + amp::common::Ident, data::{ graphql::TypeExt, store::{Id, IdType, Value}, value::Word, }, - nozzle::common::Ident, schema::{EntityKey, EntityType, Field, InputSchema}, }; pub use self::{array_decoder::ArrayDecoder, decoder::Decoder}; -/// Handles decoding of record batches to Subgraph entities. +/// Handles decoding of record batches to subgraph entities. pub struct Codec { input_schema: InputSchema, name_cache: NameCache, diff --git a/graph/src/nozzle/codec/name_cache.rs b/graph/src/amp/codec/name_cache.rs similarity index 90% rename from graph/src/nozzle/codec/name_cache.rs rename to graph/src/amp/codec/name_cache.rs index 07bf874fccc..ed8afc79c80 100644 --- a/graph/src/nozzle/codec/name_cache.rs +++ b/graph/src/amp/codec/name_cache.rs @@ -2,9 +2,9 @@ use std::collections::HashMap; use anyhow::Result; -use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; +use crate::{amp::common::Ident, cheap_clone::CheapClone}; -/// Caches identifiers that are used to match Arrow columns and Subgraph entity fields. +/// Caches identifiers that are used to match Arrow columns and subgraph entity fields. pub(super) struct NameCache { cache: HashMap, Ident>, } diff --git a/graph/src/nozzle/codec/utils.rs b/graph/src/amp/codec/utils.rs similarity index 98% rename from graph/src/nozzle/codec/utils.rs rename to graph/src/amp/codec/utils.rs index bf4581f93ca..dab04699005 100644 --- a/graph/src/nozzle/codec/utils.rs +++ b/graph/src/amp/codec/utils.rs @@ -6,7 +6,7 @@ use arrow::array::{ use chrono::{DateTime, Utc}; use super::{ArrayDecoder, Decoder}; -use crate::nozzle::common::column_aliases; +use crate::amp::common::column_aliases; pub fn auto_block_number_decoder<'a>( record_batch: &'a RecordBatch, diff --git a/graph/src/nozzle/codec/value_decoder.rs b/graph/src/amp/codec/value_decoder.rs similarity index 99% rename from graph/src/nozzle/codec/value_decoder.rs rename to graph/src/amp/codec/value_decoder.rs index 3a56f9b21a8..99fa3969d67 100644 --- a/graph/src/nozzle/codec/value_decoder.rs +++ b/graph/src/amp/codec/value_decoder.rs @@ -19,11 +19,11 @@ use crate::data::store::{ Value, ValueType, }; -/// Returns a decoder that converts an Arrow array into Subgraph store values. +/// Returns a decoder that converts an Arrow array into subgraph store values. /// /// # Errors /// -/// Returns an error if the Subgraph store type is not compatible with the Arrow array type. +/// Returns an error if the subgraph store type is not compatible with the Arrow array type. /// /// The returned error is deterministic. 
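The decoder utilities above (for example `auto_block_timestamp_decoder`) turn individual Arrow column values into subgraph-facing types. As a standalone illustration of that per-row conversion for one concrete Arrow type, not the PR's generic `ArrayDecoder`, and assuming a recent `chrono` with `from_timestamp_micros`:

```rust
use arrow::array::{Array, TimestampMicrosecondArray};
use chrono::{DateTime, Utc};

// Read one row of an Arrow microsecond-timestamp column as a `chrono` timestamp.
fn timestamp_at(column: &TimestampMicrosecondArray, row: usize) -> Option<DateTime<Utc>> {
    if column.is_null(row) {
        return None;
    }
    DateTime::<Utc>::from_timestamp_micros(column.value(row))
}

fn main() {
    let column = TimestampMicrosecondArray::from(vec![Some(1_700_000_000_000_000i64), None]);
    assert!(timestamp_at(&column, 0).is_some());
    assert!(timestamp_at(&column, 1).is_none());
}
```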
pub(super) fn value_decoder<'a>( diff --git a/graph/src/nozzle/common/ident.rs b/graph/src/amp/common/ident.rs similarity index 95% rename from graph/src/nozzle/common/ident.rs rename to graph/src/amp/common/ident.rs index 328140c77af..456bf7efef8 100644 --- a/graph/src/nozzle/common/ident.rs +++ b/graph/src/amp/common/ident.rs @@ -22,7 +22,7 @@ use crate::derive::CheapClone; /// # Example /// /// ```rust -/// # use graph::nozzle::common::Ident; +/// # use graph::amp::common::Ident; /// /// assert_eq!(Ident::new("block_hash").unwrap(), Ident::new("blockHash").unwrap()); /// assert_eq!(Ident::new("block-hash").unwrap(), Ident::new("BlockHash").unwrap()); @@ -59,7 +59,7 @@ impl Ident { /// # Example /// /// ```rust - /// # use graph::nozzle::common::Ident; + /// # use graph::amp::common::Ident; /// /// let ident = Ident::new("BLOCK_hash").unwrap(); /// assert_eq!(ident.as_str(), "BLOCK_hash"); @@ -75,7 +75,7 @@ impl Ident { /// # Example /// /// ```rust - /// # use graph::nozzle::common::Ident; + /// # use graph::amp::common::Ident; /// /// let ident = Ident::new("blockHash").unwrap(); /// assert_eq!(ident.tokens(), &["block".into(), "hash".into()]); @@ -89,7 +89,7 @@ impl Ident { /// # Example /// /// ```rust - /// # use graph::nozzle::common::Ident; + /// # use graph::amp::common::Ident; /// /// let ident = Ident::new("block_hash").unwrap(); /// assert_eq!(ident.to_lower_camel_case(), "blockHash"); @@ -103,7 +103,7 @@ impl Ident { /// # Example /// /// ```rust - /// # use graph::nozzle::common::Ident; + /// # use graph::amp::common::Ident; /// /// let ident = Ident::new("block_hash").unwrap(); /// assert_eq!(ident.to_upper_camel_case(), "BlockHash"); diff --git a/graph/src/nozzle/common/mod.rs b/graph/src/amp/common/mod.rs similarity index 76% rename from graph/src/nozzle/common/mod.rs rename to graph/src/amp/common/mod.rs index fab138d7d3d..94d51495258 100644 --- a/graph/src/nozzle/common/mod.rs +++ b/graph/src/amp/common/mod.rs @@ -3,17 +3,17 @@ mod ident; pub use self::ident::Ident; pub(super) mod column_aliases { - pub(in crate::nozzle) static BLOCK_NUMBER: &[&str] = &[ + pub(in crate::amp) static BLOCK_NUMBER: &[&str] = &[ "_block_num", // Meta column present in all tables "block_num", // Standard column in most raw tables "block", // Common alternative name "block_number", // Common alternative name ]; - pub(in crate::nozzle) static BLOCK_HASH: &[&str] = &[ + pub(in crate::amp) static BLOCK_HASH: &[&str] = &[ "hash", // Standard column in some raw tables "block_hash", // Standard column in most raw tables and common alternative name ]; - pub(in crate::nozzle) static BLOCK_TIMESTAMP: &[&str] = &[ + pub(in crate::amp) static BLOCK_TIMESTAMP: &[&str] = &[ "timestamp", // Standard column in most raw tables "block_timestamp", // Common alternative name ]; diff --git a/graph/src/nozzle/error.rs b/graph/src/amp/error.rs similarity index 100% rename from graph/src/nozzle/error.rs rename to graph/src/amp/error.rs diff --git a/graph/src/nozzle/log.rs b/graph/src/amp/log.rs similarity index 87% rename from graph/src/nozzle/log.rs rename to graph/src/amp/log.rs index f494a46a8de..e11c129b6b7 100644 --- a/graph/src/nozzle/log.rs +++ b/graph/src/amp/log.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use lazy_regex::regex_replace_all; -/// Extends the [slog::Logger] with methods commonly used in Nozzle modules +/// Extends the [slog::Logger] with methods commonly used in Amp modules pub trait Logger { /// Creates a new child logger scoped to a specific component fn component(&self, name: &'static str) -> 
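The `column_aliases` lists above let the codec locate the block-number, block-hash, and timestamp columns under several common names. A minimal, self-contained illustration of that lookup against an Arrow schema; the schema below is made up, while the alias list is copied from the hunk above:

```rust
use arrow::datatypes::{DataType, Field, Schema};

static BLOCK_NUMBER_ALIASES: &[&str] = &["_block_num", "block_num", "block", "block_number"];

// Return the index of the first column whose name matches a block-number alias.
fn block_number_column(schema: &Schema) -> Option<usize> {
    BLOCK_NUMBER_ALIASES
        .iter()
        .find_map(|&name| schema.column_with_name(name).map(|(idx, _)| idx))
}

fn main() {
    let schema = Schema::new(vec![
        Field::new("hash", DataType::Binary, false),
        Field::new("block_num", DataType::UInt64, false),
    ]);
    assert_eq!(block_number_column(&schema), Some(1));
}
```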
slog::Logger; diff --git a/graph/src/nozzle/manifest/data_source/mod.rs b/graph/src/amp/manifest/data_source/mod.rs similarity index 89% rename from graph/src/nozzle/manifest/data_source/mod.rs rename to graph/src/amp/manifest/data_source/mod.rs index bcca6aed7b4..85de05ec951 100644 --- a/graph/src/nozzle/manifest/data_source/mod.rs +++ b/graph/src/amp/manifest/data_source/mod.rs @@ -8,13 +8,13 @@ use arrow::datatypes::Schema; use semver::Version; use crate::{ + amp::{common::Ident, sql::Query}, data::subgraph::SPEC_VERSION_1_5_0, - nozzle::{common::Ident, sql::Query}, }; pub use self::raw::RawDataSource; -/// Represents a valid data source of a Nozzle Subgraph. +/// Represents a valid data source of a Amp subgraph. /// /// This data source contains parsed, formatted, and resolved data. #[derive(Debug, Clone)] @@ -30,12 +30,12 @@ pub struct DataSource { /// Contains the sources used by this data source. pub source: Source, - /// Contains the transformations of source tables indexed by the Subgraph. + /// Contains the transformations of source tables indexed by the subgraph. pub transformer: Transformer, } impl DataSource { - pub const KIND: &str = "nozzle"; + pub const KIND: &str = "amp"; pub const MIN_SPEC_VERSION: Version = SPEC_VERSION_1_5_0; } @@ -69,7 +69,7 @@ pub struct Source { pub end_block: BlockNumber, } -/// Contains the transformations of source tables indexed by the Subgraph. +/// Contains the transformations of source tables indexed by the subgraph. #[derive(Debug, Clone)] pub struct Transformer { /// The version of this transformer. @@ -100,16 +100,16 @@ pub struct Abi { pub struct Table { /// The name of the transformed table. /// - /// Must reference a valid entity name from the Subgraph schema. + /// Must reference a valid entity name from the subgraph schema. pub name: Ident, - /// The SQL query that executes on the Nozzle server. + /// The SQL query that executes on the Amp server. /// - /// The data resulting from this SQL query execution transforms into Subgraph entities. + /// The data resulting from this SQL query execution transforms into subgraph entities. pub query: Query, /// The Arrow schema of this transformed table SQL query. /// - /// This schema loads from the Nozzle server. + /// This schema loads from the Amp server. pub schema: Schema, } diff --git a/graph/src/nozzle/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs similarity index 91% rename from graph/src/nozzle/manifest/data_source/raw.rs rename to graph/src/amp/manifest/data_source/raw.rs index ac3323d9b4d..197aacd58b0 100644 --- a/graph/src/nozzle/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -14,21 +14,21 @@ use thiserror::Error; use super::{Abi, DataSource, Source, Table, Transformer}; use crate::{ - components::link_resolver::{LinkResolver, LinkResolverContext}, - data::subgraph::DeploymentHash, - nozzle::{ + amp::{ self, common::{column_aliases, Ident}, error::IsDeterministic, sql::Query, }, + components::link_resolver::{LinkResolver, LinkResolverContext}, + data::subgraph::DeploymentHash, }; /// Supported API versions for data source transformers. static API_VERSIONS: LazyLock> = LazyLock::new(|| HashSet::from([Version::new(0, 0, 1)])); -/// Represents an unmodified input data source of a Nozzle Subgraph. +/// Represents an unmodified input data source of a Amp subgraph. /// /// May contain invalid or partial data. #[derive(Debug, Clone, Deserialize)] @@ -41,7 +41,7 @@ pub struct RawDataSource { /// The kind of the data source. 
/// - /// Must be equal to `nozzle`. + /// Must be equal to `amp`. pub kind: String, /// The network name of the data source. @@ -50,7 +50,7 @@ pub struct RawDataSource { /// Contains sources used by this data source. pub source: RawSource, - /// Contains transformations of source tables indexed by the Subgraph. + /// Contains transformations of source tables indexed by the subgraph. pub transformer: RawTransformer, } @@ -60,7 +60,7 @@ impl RawDataSource { self, logger: &Logger, link_resolver: &dyn LinkResolver, - nozzle_client: &impl nozzle::Client, + amp_client: &impl amp::Client, ) -> Result { let Self { name, @@ -78,7 +78,7 @@ impl RawDataSource { .map_err(|e| e.source_context("invalid `source`"))?; let transformer = transformer - .resolve(logger, link_resolver, nozzle_client, &source) + .resolve(logger, link_resolver, amp_client, &source) .await .map_err(|e| e.source_context("invalid `transformer`"))?; @@ -111,12 +111,12 @@ impl RawDataSource { pub struct RawSource { /// The dataset that SQL queries in the data source can query. /// - /// Must reference a valid dataset name from the Nozzle server. + /// Must reference a valid dataset name from the Amp server. pub dataset: String, /// The tables that SQL queries in the data source can query. /// - /// Must reference valid table names of the dataset from the Nozzle server. + /// Must reference valid table names of the dataset from the Amp server. pub tables: Vec, /// The contract address used by SQL queries in the data source. @@ -192,7 +192,7 @@ impl RawSource { } } -/// Contains unmodified input transformations of source tables indexed by the Subgraph. +/// Contains unmodified input transformations of source tables indexed by the subgraph. /// /// May contain invalid or partial data. #[derive(Debug, Clone, Deserialize)] @@ -200,7 +200,7 @@ impl RawSource { pub struct RawTransformer { /// The version of this transformer. /// - /// Must be a supported API version of the Nozzle Subgraph transformers API. + /// Must be a supported API version of the Amp subgraph transformers API. pub api_version: Version, /// The ABIs that SQL queries can reference to extract event signatures. @@ -219,7 +219,7 @@ impl RawTransformer { self, logger: &Logger, link_resolver: &dyn LinkResolver, - nozzle_client: &impl nozzle::Client, + amp_client: &impl amp::Client, source: &Source, ) -> Result { let Self { @@ -230,8 +230,7 @@ impl RawTransformer { let api_version = Self::resolve_api_version(api_version)?; let abis = Self::resolve_abis(logger, link_resolver, abis).await?; let tables = - Self::resolve_tables(logger, link_resolver, nozzle_client, tables, source, &abis) - .await?; + Self::resolve_tables(logger, link_resolver, amp_client, tables, source, &abis).await?; Ok(Transformer { api_version, @@ -277,7 +276,7 @@ impl RawTransformer { async fn resolve_tables( logger: &Logger, link_resolver: &dyn LinkResolver, - nozzle_client: &impl nozzle::Client, + amp_client: &impl amp::Client, tables: Vec, source: &Source, abis: &[Abi], @@ -296,7 +295,7 @@ impl RawTransformer { let table_futs = tables.into_iter().enumerate().map(|(i, table)| async move { table - .resolve(logger, link_resolver, nozzle_client, source, abis) + .resolve(logger, link_resolver, amp_client, source, abis) .await .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) }); @@ -368,17 +367,17 @@ impl RawAbi { pub struct RawTable { /// The name of the transformed table. /// - /// Must reference a valid entity name from the Subgraph schema. 
+ /// Must reference a valid entity name from the subgraph schema. pub name: String, - /// The SQL query that executes on the Nozzle server. + /// The SQL query that executes on the Amp server. /// - /// Transforms the execution results into Subgraph entities. + /// Transforms the execution results into subgraph entities. pub query: Option, - /// The IPFS link to the SQL query that executes on the Nozzle server. + /// The IPFS link to the SQL query that executes on the Amp server. /// - /// Transforms the execution results into Subgraph entities. + /// Transforms the execution results into subgraph entities. /// /// Ignored when `query` is set. pub file: Option, @@ -390,7 +389,7 @@ impl RawTable { self, logger: &Logger, link_resolver: &dyn LinkResolver, - nozzle_client: &impl nozzle::Client, + amp_client: &impl amp::Client, source: &Source, abis: &[Abi], ) -> Result { @@ -400,7 +399,7 @@ impl RawTable { Some(query) => query, None => Self::resolve_file(logger, link_resolver, file, source, abis).await?, }; - let schema = Self::resolve_schema(logger, nozzle_client, &query).await?; + let schema = Self::resolve_schema(logger, amp_client, &query).await?; Ok(Table { name, @@ -479,15 +478,17 @@ impl RawTable { async fn resolve_schema( logger: &Logger, - nozzle_client: &impl nozzle::Client, + amp_client: &impl amp::Client, query: &Query, ) -> Result { - let schema = nozzle_client.schema(logger, &query).await.map_err(|e| { - Error::FailedToExecuteQuery { - is_deterministic: e.is_deterministic(), - source: anyhow!(e).context("failed to load schema"), - } - })?; + let schema = + amp_client + .schema(logger, &query) + .await + .map_err(|e| Error::FailedToExecuteQuery { + is_deterministic: e.is_deterministic(), + source: anyhow!(e).context("failed to load schema"), + })?; let check_required_column = |c: &[&str], kind: &str| { if !c.iter().any(|&c| schema.column_with_name(c).is_some()) { diff --git a/graph/src/nozzle/manifest/mod.rs b/graph/src/amp/manifest/mod.rs similarity index 71% rename from graph/src/nozzle/manifest/mod.rs rename to graph/src/amp/manifest/mod.rs index c0b83744ea9..9a16da1f194 100644 --- a/graph/src/nozzle/manifest/mod.rs +++ b/graph/src/amp/manifest/mod.rs @@ -8,18 +8,18 @@ use semver::Version; use slog::Logger; use crate::{ + amp::Client, blockchain::Blockchain, cheap_clone::CheapClone as _, components::link_resolver::LinkResolver, data::subgraph::{BaseSubgraphManifest, DeploymentHash, UnresolvedSubgraphManifest}, data_source::DataSource as GenericDataSource, - nozzle::Client, schema::InputSchema, }; pub use self::data_source::DataSource; -/// Represents a valid Nozzle subgraph manifest. +/// Represents a valid Amp subgraph manifest. /// /// This manifest contains parsed, formatted, and resolved data. #[derive(Debug, Clone)] @@ -29,18 +29,18 @@ pub struct Manifest { /// Contains all the entities, aggregations, and relationships between them. pub schema: InputSchema, - /// The Nozzle data sources of the subgraph. + /// The Amp data sources of the subgraph. /// - /// A Nozzle subgraph can only contain Nozzle data sources. + /// A Amp subgraph can only contain Amp data sources. pub data_sources: Vec, } impl Manifest { - /// Resolves and returns a valid Nozzle subgraph manifest. - pub async fn resolve( + /// Resolves and returns a valid Amp subgraph manifest. 
+ pub async fn resolve( logger: &Logger, link_resolver: Arc, - nozzle_client: Arc, + amp_client: Arc, max_spec_version: Version, deployment: DeploymentHash, raw_manifest: serde_yaml::Mapping, @@ -53,7 +53,7 @@ impl Manifest { .resolve( &deployment, &link_resolver, - Some(nozzle_client), + Some(amp_client), logger, max_spec_version, ) @@ -75,25 +75,25 @@ impl Manifest { } = resolved_manifest; let data_sources_count = data_sources.len(); - let nozzle_data_sources = data_sources + let amp_data_sources = data_sources .into_iter() .filter_map(|data_source| match data_source { - GenericDataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + GenericDataSource::Amp(amp_data_source) => Some(amp_data_source), _ => None, }) .collect_vec(); - if nozzle_data_sources.is_empty() { - bail!("invalid subgraph manifest: failed to find Nozzle data sources"); + if amp_data_sources.is_empty() { + bail!("invalid subgraph manifest: failed to find Amp data sources"); } - if nozzle_data_sources.len() != data_sources_count { - bail!("invalid subgraph manifest: only Nozzle data sources are allowed"); + if amp_data_sources.len() != data_sources_count { + bail!("invalid subgraph manifest: only Amp data sources are allowed"); } Ok(Self { schema, - data_sources: nozzle_data_sources, + data_sources: amp_data_sources, }) } } diff --git a/graph/src/nozzle/mod.rs b/graph/src/amp/mod.rs similarity index 76% rename from graph/src/nozzle/mod.rs rename to graph/src/amp/mod.rs index 9683c042f3c..9541d450626 100644 --- a/graph/src/nozzle/mod.rs +++ b/graph/src/amp/mod.rs @@ -1,4 +1,4 @@ -//! This module contains the functionality required to support Nozzle Subgraphs. +//! This module contains the functionality required to support Amp subgraphs. pub mod client; pub mod codec; diff --git a/graph/src/nozzle/schema/generator/entity.rs b/graph/src/amp/schema/generator/entity.rs similarity index 91% rename from graph/src/nozzle/schema/generator/entity.rs rename to graph/src/amp/schema/generator/entity.rs index d53249ce948..0770f1c6214 100644 --- a/graph/src/nozzle/schema/generator/entity.rs +++ b/graph/src/amp/schema/generator/entity.rs @@ -2,20 +2,20 @@ use std::fmt; use anyhow::{bail, Context, Result}; -use crate::{cheap_clone::CheapClone, data::store::ValueType, nozzle::common::Ident}; +use crate::{amp::common::Ident, cheap_clone::CheapClone, data::store::ValueType}; -/// A minimal representation of a Subgraph entity. +/// A minimal representation of a subgraph entity. pub(super) struct Entity { name: Ident, fields: Vec, } impl Entity { - /// Converts the Arrow schema to a Subgraph entity. + /// Converts the Arrow schema to a subgraph entity. /// /// # Errors /// - /// Returns an error if Arrow fields cannot be converted to Subgraph entity fields. + /// Returns an error if Arrow fields cannot be converted to subgraph entity fields. /// /// The returned error is deterministic. pub(super) fn new(name: Ident, arrow_schema: arrow::datatypes::Schema) -> Result { @@ -49,7 +49,7 @@ impl fmt::Display for Entity { } } -/// A minimal representation of a Subgraph entity field. +/// A minimal representation of a subgraph entity field. struct Field { name: Ident, value_type: ValueType, @@ -58,13 +58,13 @@ struct Field { } impl Field { - /// Converts the Arrow field to a Subgraph entity field. + /// Converts the Arrow field to a subgraph entity field. 
/// /// # Errors /// /// Returns an error if: /// - The Arrow field has an invalid name - /// - The Arrow field type cannot be converted to a Subgraph entity value type + /// - The Arrow field type cannot be converted to a subgraph entity value type /// /// The returned error is deterministic. fn new(arrow_field: &arrow::datatypes::Field) -> Result { @@ -80,7 +80,7 @@ impl Field { }) } - /// Creates an `ID` Subgraph entity field. + /// Creates an `ID` subgraph entity field. fn id() -> Self { Self { name: Ident::new("id").unwrap(), diff --git a/graph/src/nozzle/schema/generator/mod.rs b/graph/src/amp/schema/generator/mod.rs similarity index 88% rename from graph/src/nozzle/schema/generator/mod.rs rename to graph/src/amp/schema/generator/mod.rs index fcdef01d970..8ffd673cf6d 100644 --- a/graph/src/nozzle/schema/generator/mod.rs +++ b/graph/src/amp/schema/generator/mod.rs @@ -6,19 +6,19 @@ use itertools::Itertools; use self::entity::Entity; use crate::{ - cheap_clone::CheapClone, data::subgraph::DeploymentHash, nozzle::common::Ident, + amp::common::Ident, cheap_clone::CheapClone, data::subgraph::DeploymentHash, schema::InputSchema, }; -/// Generates a Subgraph schema from a list of Arrow schemas. +/// Generates a subgraph schema from a list of Arrow schemas. /// /// # Limitations /// -/// The generated Subgraph entities are immutable and do not contain any relationships to other entities within the schema. +/// The generated subgraph entities are immutable and do not contain any relationships to other entities within the schema. /// /// # Errors /// -/// Returns an error if any of the Arrow schemas cannot be represented as valid Subgraph entities. +/// Returns an error if any of the Arrow schemas cannot be represented as valid subgraph entities. /// /// The returned error is deterministic. pub fn generate_subgraph_schema( diff --git a/graph/src/nozzle/schema/mod.rs b/graph/src/amp/schema/mod.rs similarity index 100% rename from graph/src/nozzle/schema/mod.rs rename to graph/src/amp/schema/mod.rs diff --git a/graph/src/nozzle/sql/mod.rs b/graph/src/amp/sql/mod.rs similarity index 100% rename from graph/src/nozzle/sql/mod.rs rename to graph/src/amp/sql/mod.rs diff --git a/graph/src/nozzle/sql/query/filter_blocks.rs b/graph/src/amp/sql/query/filter_blocks.rs similarity index 98% rename from graph/src/nozzle/sql/query/filter_blocks.rs rename to graph/src/amp/sql/query/filter_blocks.rs index de1eec59edd..ad2fbae859a 100644 --- a/graph/src/nozzle/sql/query/filter_blocks.rs +++ b/graph/src/amp/sql/query/filter_blocks.rs @@ -7,7 +7,7 @@ use alloy::primitives::BlockNumber; use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; use super::parse; -use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; +use crate::{amp::common::Ident, cheap_clone::CheapClone}; /// Applies a block range filter to the SQL query. /// diff --git a/graph/src/nozzle/sql/query/mod.rs b/graph/src/amp/sql/query/mod.rs similarity index 94% rename from graph/src/nozzle/sql/query/mod.rs rename to graph/src/amp/sql/query/mod.rs index 28b3496ea93..8918da2009f 100644 --- a/graph/src/nozzle/sql/query/mod.rs +++ b/graph/src/amp/sql/query/mod.rs @@ -13,12 +13,12 @@ use anyhow::{bail, Context, Result}; use itertools::Itertools; use sqlparser_latest::ast; -use crate::{cheap_clone::CheapClone, nozzle::common::Ident}; +use crate::{amp::common::Ident, cheap_clone::CheapClone}; -/// Represents a valid SQL query of a Nozzle Subgraph. +/// Represents a valid SQL query of a Amp subgraph. 
/// -/// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. -/// The data returned by executing this query is used to create Subgraph entities. +/// Parses, validates and resolves a SQL query and prepares it for execution on a Amp server. +/// The data returned by executing this query is used to create subgraph entities. #[derive(Debug, Clone)] pub struct Query { /// The raw SQL AST that represents the SQL query. @@ -32,7 +32,7 @@ pub struct Query { } impl Query { - /// Parses, validates and resolves a SQL query and prepares it for execution on a Nozzle server. + /// Parses, validates and resolves a SQL query and prepares it for execution on a Amp server. /// /// # Errors /// @@ -96,7 +96,7 @@ impl Query { Ok(()) } - /// Resolves Subgraph-specific function calls in the SQL query. + /// Resolves subgraph-specific function calls in the SQL query. /// /// # Errors /// diff --git a/graph/src/nozzle/sql/query/resolve_event_signatures.rs b/graph/src/amp/sql/query/resolve_event_signatures.rs similarity index 98% rename from graph/src/nozzle/sql/query/resolve_event_signatures.rs rename to graph/src/amp/sql/query/resolve_event_signatures.rs index d67bc0263d8..c146e0e4050 100644 --- a/graph/src/nozzle/sql/query/resolve_event_signatures.rs +++ b/graph/src/amp/sql/query/resolve_event_signatures.rs @@ -4,7 +4,7 @@ use alloy::json_abi::JsonAbi; use anyhow::{bail, Context, Result}; use sqlparser_latest::ast::{self, visit_expressions_mut}; -use crate::nozzle::common::Ident; +use crate::amp::common::Ident; static FUNCTION_NAME: &str = "sg_event_signature"; diff --git a/graph/src/nozzle/sql/query/resolve_source_address.rs b/graph/src/amp/sql/query/resolve_source_address.rs similarity index 100% rename from graph/src/nozzle/sql/query/resolve_source_address.rs rename to graph/src/amp/sql/query/resolve_source_address.rs diff --git a/graph/src/nozzle/sql/query/validate_tables.rs b/graph/src/amp/sql/query/validate_tables.rs similarity index 98% rename from graph/src/nozzle/sql/query/validate_tables.rs rename to graph/src/amp/sql/query/validate_tables.rs index 429d60a4863..f9a156ea425 100644 --- a/graph/src/nozzle/sql/query/validate_tables.rs +++ b/graph/src/amp/sql/query/validate_tables.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use anyhow::{anyhow, bail, Error, Result}; use sqlparser_latest::ast::{self, Visit, Visitor}; -use crate::nozzle::common::Ident; +use crate::amp::common::Ident; /// Validates the dataset and tables used by the SQL query to ensure consistency with the explicitly declared ones. 
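The query resolver above rewrites `sg_event_signature(...)` calls using the ABIs declared by the data source, going through `alloy::json_abi::JsonAbi`. The exact substitution format is not shown in the diff, so the following is only an editorial sketch of the kind of lookup such a call plausibly resolves to, an event's topic0 derived from a JSON ABI:

```rust
use alloy::json_abi::JsonAbi;

// Look up an event by name in a JSON ABI and return its keccak256 selector (topic0).
fn event_topic0(abi_json: &str, event_name: &str) -> anyhow::Result<String> {
    let abi: JsonAbi = serde_json::from_str(abi_json)?;
    let event = abi
        .events
        .get(event_name)
        .and_then(|overloads| overloads.first())
        .ok_or_else(|| anyhow::anyhow!("event `{event_name}` not found in ABI"))?;
    // `selector()` is keccak256 of the canonical signature, i.e. the log's topic0.
    Ok(format!("{:#x}", event.selector()))
}
```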
/// diff --git a/graph/src/nozzle/stream_aggregator/error.rs b/graph/src/amp/stream_aggregator/error.rs similarity index 96% rename from graph/src/nozzle/stream_aggregator/error.rs rename to graph/src/amp/stream_aggregator/error.rs index 187e1c48506..b58f3e24799 100644 --- a/graph/src/nozzle/stream_aggregator/error.rs +++ b/graph/src/amp/stream_aggregator/error.rs @@ -1,6 +1,6 @@ use thiserror::Error; -use crate::nozzle::error::IsDeterministic; +use crate::amp::error::IsDeterministic; #[derive(Debug, Error)] pub enum Error { diff --git a/graph/src/nozzle/stream_aggregator/mod.rs b/graph/src/amp/stream_aggregator/mod.rs similarity index 97% rename from graph/src/nozzle/stream_aggregator/mod.rs rename to graph/src/amp/stream_aggregator/mod.rs index 00583e423c1..1f5c558b77c 100644 --- a/graph/src/nozzle/stream_aggregator/mod.rs +++ b/graph/src/amp/stream_aggregator/mod.rs @@ -12,7 +12,7 @@ use futures03::{stream::BoxStream, Stream, StreamExt, TryStreamExt}; use slog::{debug, info, Logger}; use self::record_batch::Buffer; -use crate::nozzle::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}; +use crate::amp::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}; pub use self::{ error::Error, @@ -26,7 +26,7 @@ pub use self::{ /// is grouped and streamed in batches. /// /// The reason the aggregation is required is to ensure compatibility with the existing -/// Subgraph storage implementation. +/// subgraph storage implementation. /// /// # Stream requirements /// @@ -55,7 +55,7 @@ impl StreamAggregator { where E: std::error::Error + IsDeterministic + Send + Sync + 'static, { - let logger = logger.component("nozzle::StreamAggregator"); + let logger = logger.component("AmpStreamAggregator"); let streams = streams .into_iter() diff --git a/graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs b/graph/src/amp/stream_aggregator/record_batch/aggregator.rs similarity index 100% rename from graph/src/nozzle/stream_aggregator/record_batch/aggregator.rs rename to graph/src/amp/stream_aggregator/record_batch/aggregator.rs diff --git a/graph/src/nozzle/stream_aggregator/record_batch/buffer.rs b/graph/src/amp/stream_aggregator/record_batch/buffer.rs similarity index 100% rename from graph/src/nozzle/stream_aggregator/record_batch/buffer.rs rename to graph/src/amp/stream_aggregator/record_batch/buffer.rs diff --git a/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs b/graph/src/amp/stream_aggregator/record_batch/decoder.rs similarity index 98% rename from graph/src/nozzle/stream_aggregator/record_batch/decoder.rs rename to graph/src/amp/stream_aggregator/record_batch/decoder.rs index 26d056c623d..af94613b878 100644 --- a/graph/src/nozzle/stream_aggregator/record_batch/decoder.rs +++ b/graph/src/amp/stream_aggregator/record_batch/decoder.rs @@ -2,7 +2,7 @@ use alloy::primitives::{BlockHash, BlockNumber}; use anyhow::{anyhow, Result}; use arrow::array::RecordBatch; -use crate::nozzle::codec::{ +use crate::amp::codec::{ self, utils::{auto_block_hash_decoder, auto_block_number_decoder}, }; diff --git a/graph/src/amp/stream_aggregator/record_batch/group_data.rs b/graph/src/amp/stream_aggregator/record_batch/group_data.rs new file mode 100644 index 00000000000..32d3317c585 --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/group_data.rs @@ -0,0 +1,88 @@ +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow::{ + array::{RecordBatch, UInt64Array}, + compute::{concat_batches, take_record_batch}, +}; + +/// Contains references to all record batches 
and rows of a group. +pub(super) struct GroupData { + parts: Vec, +} + +struct Part { + record_batch: Arc, + row_indices: Vec, +} + +impl GroupData { + /// Creates a new group with an initial `record_batch` and `row_index`. + pub(super) fn new(record_batch: Arc, row_index: usize) -> Self { + Self { + parts: vec![Part { + record_batch, + row_indices: vec![row_index as u64], + }], + } + } + + /// Adds a new `record_batch` and `row_index` to this group. + pub(super) fn add(&mut self, record_batch: Arc, row_index: usize) { + self.parts.push(Part { + record_batch, + row_indices: vec![row_index as u64], + }) + } + + /// Adds a `row_index` to the most recent record batch in this group. + /// + /// # Panics + /// + /// Panics if this group is empty. + pub(super) fn add_row_index(&mut self, row_index: usize) { + assert!(!self.parts.is_empty()); + + self.parts + .last_mut() + .unwrap() + .row_indices + .push(row_index as u64); + } + + /// Converts this group into a single record batch. + /// + /// Merges all group rows from all record batches together. + /// + /// # Errors + /// + /// Returns an error if the record batches in this group have incompatible types. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if: + /// - This group is empty + /// - This group contains invalid row indices + pub(super) fn into_record_batch(self) -> Result { + assert!(!self.parts.is_empty()); + + let schema = self.parts[0].record_batch.schema(); + let mut partial_record_batches = Vec::with_capacity(self.parts.len()); + + for part in self.parts { + let Part { + record_batch, + row_indices, + } = part; + + let row_indices = UInt64Array::from(row_indices); + let partial_record_batch = take_record_batch(&record_batch, &row_indices).unwrap(); + + partial_record_batches.push(partial_record_batch); + } + + concat_batches(&schema, &partial_record_batches).context("failed to merge record batches") + } +} diff --git a/graph/src/nozzle/stream_aggregator/record_batch/mod.rs b/graph/src/amp/stream_aggregator/record_batch/mod.rs similarity index 100% rename from graph/src/nozzle/stream_aggregator/record_batch/mod.rs rename to graph/src/amp/stream_aggregator/record_batch/mod.rs diff --git a/graph/src/blockchain/mod.rs b/graph/src/blockchain/mod.rs index cd22acb8a69..1346213b879 100644 --- a/graph/src/blockchain/mod.rs +++ b/graph/src/blockchain/mod.rs @@ -595,7 +595,7 @@ impl FromStr for BlockchainKind { "near" => Ok(BlockchainKind::Near), "substreams" => Ok(BlockchainKind::Substreams), "subgraph" => Ok(BlockchainKind::Ethereum), // TODO(krishna): We should detect the blockchain kind from the source subgraph - "nozzle" => Ok(BlockchainKind::Ethereum), // TODO: Maybe get this from the Nozzle server + "amp" => Ok(BlockchainKind::Ethereum), // TODO: Maybe get this from the Amp server _ => Err(anyhow!("unknown blockchain kind {}", s)), } } diff --git a/graph/src/components/store/err.rs b/graph/src/components/store/err.rs index 627320bdc76..d59a835d57b 100644 --- a/graph/src/components/store/err.rs +++ b/graph/src/components/store/err.rs @@ -248,7 +248,7 @@ impl From for StoreError { } } -impl crate::nozzle::error::IsDeterministic for StoreError { +impl crate::amp::error::IsDeterministic for StoreError { fn is_deterministic(&self) -> bool { StoreError::is_deterministic(self) } diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index 3225b10bca8..f0baac7766b 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -32,7 +32,7 @@ use wasmparser; use 
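The new `GroupData` helper above re-assembles one group's rows out of several source batches. A standalone arrow example of the same two calls it relies on, `take_record_batch` to pick rows and `concat_batches` to merge the pieces; the data and indices here are made up:

```rust
use std::sync::Arc;

use arrow::{
    array::{ArrayRef, Int64Array, RecordBatch, UInt64Array},
    compute::{concat_batches, take_record_batch},
    datatypes::{DataType, Field, Schema},
    error::ArrowError,
};

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![10i64, 20, 30, 40])) as ArrayRef],
    )?;

    // Select rows 1 and 3 of the source batch (the rows belonging to one group).
    let indices = UInt64Array::from(vec![1u64, 3]);
    let part = take_record_batch(&batch, &indices)?;

    // Merging the per-batch selections yields the group's final record batch,
    // just as `GroupData::into_record_batch` does for its accumulated parts.
    let merged = concat_batches(&schema, &[part])?;
    assert_eq!(merged.num_rows(), 2);
    Ok(())
}
```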
web3::types::Address; use crate::{ - bail, + amp, bail, blockchain::{BlockPtr, Blockchain}, components::{ link_resolver::{LinkResolver, LinkResolverContext}, @@ -47,7 +47,7 @@ use crate::{ UnresolvedDataSourceTemplate, }, derive::CacheWeight, - ensure, nozzle, + ensure, prelude::{r, Value, ENV_VARS}, schema::{InputSchema, SchemaValidationError}, }; @@ -363,8 +363,8 @@ pub enum SubgraphManifestValidationError { FeatureValidationError(#[from] SubgraphFeatureValidationError), #[error("data source {0} is invalid: {1}")] DataSourceValidation(String, Error), - #[error("failed to validate Nozzle subgraph: {0:#}")] - Nozzle(#[source] Error), + #[error("failed to validate Amp subgraph: {0:#}")] + Amp(#[source] Error), } #[derive(Error, Debug)] @@ -804,11 +804,11 @@ impl UnvalidatedSubgraphManifest { /// Entry point for resolving a subgraph definition. /// Right now the only supported links are of the form: /// `/ipfs/QmUmg7BZC1YP1ca66rRtWKxpXp77WgVHrnv263JtDuvs2k` - pub async fn resolve( + pub async fn resolve( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { @@ -817,7 +817,7 @@ impl UnvalidatedSubgraphManifest { id, raw, resolver, - nozzle_client, + amp_client, logger, max_spec_version, ) @@ -886,7 +886,7 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version, )); - errors.append(&mut Self::validate_nozzle_subgraph(&self.0)); + errors.append(&mut Self::validate_amp_subgraph(&self.0)); match errors.is_empty() { true => Ok(self.0), @@ -898,10 +898,10 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version } - fn validate_nozzle_subgraph( + fn validate_amp_subgraph( manifest: &SubgraphManifest, ) -> Vec { - use api_version::SPEC_VERSION_1_4_0; + use api_version::SPEC_VERSION_1_5_0; let BaseSubgraphManifest { id: _, @@ -917,27 +917,27 @@ impl UnvalidatedSubgraphManifest { indexer_hints: _, } = manifest; - let nozzle_data_sources = data_sources + let amp_data_sources = data_sources .iter() .filter_map(|data_source| match data_source { - DataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + DataSource::Amp(amp_data_source) => Some(amp_data_source), _ => None, }) .collect_vec(); - if nozzle_data_sources.is_empty() { - // Not a Nozzle subgraph + if amp_data_sources.is_empty() { + // Not a Amp subgraph return Vec::new(); } let mut errors = Vec::new(); - let err = |msg: &str| SubgraphManifestValidationError::Nozzle(anyhow!(msg.to_owned())); + let err = |msg: &str| SubgraphManifestValidationError::Amp(anyhow!(msg.to_owned())); - if data_sources.len() != nozzle_data_sources.len() { + if data_sources.len() != amp_data_sources.len() { errors.push(err("multiple data source kinds are not supported")); } - if *spec_version < SPEC_VERSION_1_4_0 { + if *spec_version < SPEC_VERSION_1_5_0 { errors.push(err("spec version is not supported")); } @@ -959,17 +959,17 @@ impl UnvalidatedSubgraphManifest { impl SubgraphManifest { /// Entry point for resolving a subgraph definition. 
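One behavioural change buried in the rename above: `validate_amp_subgraph` now gates on `SPEC_VERSION_1_5_0` rather than `SPEC_VERSION_1_4_0`. A minimal illustration of that check with the `semver` crate; the function name is illustrative, not graph-node's:

```rust
use semver::Version;

// Amp subgraph manifests below spec version 1.5.0 fail manifest validation.
fn amp_spec_version_supported(spec_version: &Version) -> bool {
    *spec_version >= Version::new(1, 5, 0)
}

fn main() {
    assert!(!amp_spec_version_supported(&Version::parse("1.4.0").unwrap()));
    assert!(amp_spec_version_supported(&Version::parse("1.5.0").unwrap()));
}
```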
- pub async fn resolve_from_raw( + pub async fn resolve_from_raw( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { let unresolved = UnresolvedSubgraphManifest::parse(id.cheap_clone(), raw)?; let resolved = unresolved - .resolve(&id, resolver, nozzle_client, logger, max_spec_version) + .resolve(&id, resolver, amp_client, logger, max_spec_version) .await?; Ok(resolved) } @@ -1105,11 +1105,11 @@ impl UnresolvedSubgraphManifest { serde_yaml::from_value(raw.into()).map_err(Into::into) } - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result, SubgraphManifestResolveError> { @@ -1147,7 +1147,7 @@ impl UnresolvedSubgraphManifest { ds.resolve( deployment_hash, resolver, - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), logger, idx as u32, &spec_version, @@ -1155,10 +1155,10 @@ impl UnresolvedSubgraphManifest { })) .await?; - let nozzle_data_sources = data_sources + let amp_data_sources = data_sources .iter() .filter_map(|data_source| match data_source { - DataSource::Nozzle(nozzle_data_source) => Some(nozzle_data_source), + DataSource::Amp(amp_data_source) => Some(amp_data_source), _ => None, }) .collect_vec(); @@ -1175,8 +1175,8 @@ impl UnresolvedSubgraphManifest { ) .await? } - None if nozzle_data_sources.len() == data_sources.len() => { - let table_schemas = nozzle_data_sources + None if amp_data_sources.len() == data_sources.len() => { + let table_schemas = amp_data_sources .iter() .map(|data_source| { data_source @@ -1187,7 +1187,7 @@ impl UnresolvedSubgraphManifest { }) .flatten(); - nozzle::schema::generate_subgraph_schema(&id, table_schemas)? + amp::schema::generate_subgraph_schema(&id, table_schemas)? 
} None => { return Err(anyhow!("subgraph schema is required").into()); @@ -1292,7 +1292,7 @@ impl UnresolvedSubgraphManifest { indexer_hints, }; - if let Some(e) = UnvalidatedSubgraphManifest::::validate_nozzle_subgraph(&manifest) + if let Some(e) = UnvalidatedSubgraphManifest::::validate_amp_subgraph(&manifest) .into_iter() .next() { diff --git a/graph/src/data_source/mod.rs b/graph/src/data_source/mod.rs index 4c1addbc2a6..e1598e2f0df 100644 --- a/graph/src/data_source/mod.rs +++ b/graph/src/data_source/mod.rs @@ -36,14 +36,14 @@ use std::{ }; use thiserror::Error; -use crate::nozzle; +use crate::amp; #[derive(Debug)] pub enum DataSource { Onchain(C::DataSource), Offchain(offchain::DataSource), Subgraph(subgraph::DataSource), - Nozzle(nozzle::manifest::DataSource), + Amp(amp::manifest::DataSource), } #[derive(Error, Debug)] @@ -99,7 +99,7 @@ impl DataSource { Self::Onchain(ds) => Some(ds), Self::Offchain(_) => None, Self::Subgraph(_) => None, - Self::Nozzle(_) => None, + Self::Amp(_) => None, } } @@ -108,7 +108,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(_) => None, Self::Subgraph(ds) => Some(ds), - Self::Nozzle(_) => None, + Self::Amp(_) => None, } } @@ -117,7 +117,7 @@ impl DataSource { Self::Onchain(_) => true, Self::Offchain(_) => false, Self::Subgraph(_) => true, - Self::Nozzle(_) => true, + Self::Amp(_) => true, } } @@ -126,7 +126,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(ds) => Some(ds), Self::Subgraph(_) => None, - Self::Nozzle(_) => None, + Self::Amp(_) => None, } } @@ -135,7 +135,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => ds.network(), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => ds.network(), - Self::Nozzle(ds) => Some(&ds.network), + Self::Amp(ds) => Some(&ds.network), } } @@ -144,7 +144,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => Some(ds.start_block()), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => Some(ds.source.start_block), - Self::Nozzle(ds) => Some(ds.source.start_block as i32), + Self::Amp(ds) => Some(ds.source.start_block as i32), } } @@ -161,7 +161,7 @@ impl DataSource { Self::Onchain(ds) => ds.address().map(ToOwned::to_owned), Self::Offchain(ds) => ds.address(), Self::Subgraph(ds) => ds.address(), - Self::Nozzle(ds) => Some(ds.source.address.to_vec()), + Self::Amp(ds) => Some(ds.source.address.to_vec()), } } @@ -170,7 +170,7 @@ impl DataSource { Self::Onchain(ds) => ds.name(), Self::Offchain(ds) => &ds.name, Self::Subgraph(ds) => &ds.name, - Self::Nozzle(ds) => ds.name.as_str(), + Self::Amp(ds) => ds.name.as_str(), } } @@ -179,7 +179,7 @@ impl DataSource { Self::Onchain(ds) => ds.kind().to_owned(), Self::Offchain(ds) => ds.kind.to_string(), Self::Subgraph(ds) => ds.kind.clone(), - Self::Nozzle(_) => nozzle::manifest::DataSource::KIND.to_string(), + Self::Amp(_) => amp::manifest::DataSource::KIND.to_string(), } } @@ -188,7 +188,7 @@ impl DataSource { Self::Onchain(ds) => ds.min_spec_version(), Self::Offchain(ds) => ds.min_spec_version(), Self::Subgraph(ds) => ds.min_spec_version(), - Self::Nozzle(_) => nozzle::manifest::DataSource::MIN_SPEC_VERSION, + Self::Amp(_) => amp::manifest::DataSource::MIN_SPEC_VERSION, } } @@ -197,7 +197,7 @@ impl DataSource { Self::Onchain(ds) => ds.end_block(), Self::Offchain(_) => None, Self::Subgraph(_) => None, - Self::Nozzle(ds) => Some(ds.source.end_block as i32), + Self::Amp(ds) => Some(ds.source.end_block as i32), } } @@ -206,7 +206,7 @@ impl DataSource { Self::Onchain(ds) => ds.creation_block(), Self::Offchain(ds) => 
ds.creation_block, Self::Subgraph(ds) => ds.creation_block, - Self::Nozzle(_) => None, + Self::Amp(_) => None, } } @@ -215,7 +215,7 @@ impl DataSource { Self::Onchain(ds) => ds.context(), Self::Offchain(ds) => ds.context.clone(), Self::Subgraph(ds) => ds.context.clone(), - Self::Nozzle(_) => Arc::new(None), + Self::Amp(_) => Arc::new(None), } } @@ -224,7 +224,7 @@ impl DataSource { Self::Onchain(ds) => ds.api_version(), Self::Offchain(ds) => ds.mapping.api_version.clone(), Self::Subgraph(ds) => ds.mapping.api_version.clone(), - Self::Nozzle(ds) => ds.transformer.api_version.clone(), + Self::Amp(ds) => ds.transformer.api_version.clone(), } } @@ -233,7 +233,7 @@ impl DataSource { Self::Onchain(ds) => ds.runtime(), Self::Offchain(ds) => Some(ds.mapping.runtime.cheap_clone()), Self::Subgraph(ds) => Some(ds.mapping.runtime.cheap_clone()), - Self::Nozzle(_) => None, + Self::Amp(_) => None, } } @@ -244,7 +244,7 @@ impl DataSource { Self::Onchain(_) => EntityTypeAccess::Any, Self::Offchain(ds) => EntityTypeAccess::Restriced(ds.mapping.entities.clone()), Self::Subgraph(_) => EntityTypeAccess::Any, - Self::Nozzle(_) => EntityTypeAccess::Any, + Self::Amp(_) => EntityTypeAccess::Any, } } @@ -253,7 +253,7 @@ impl DataSource { Self::Onchain(ds) => ds.handler_kinds(), Self::Offchain(ds) => vec![ds.handler_kind()].into_iter().collect(), Self::Subgraph(ds) => vec![ds.handler_kind()].into_iter().collect(), - Self::Nozzle(_) => HashSet::new(), + Self::Amp(_) => HashSet::new(), } } @@ -262,7 +262,7 @@ impl DataSource { Self::Onchain(ds) => ds.has_declared_calls(), Self::Offchain(_) => false, Self::Subgraph(_) => false, - Self::Nozzle(_) => false, + Self::Amp(_) => false, } } @@ -289,7 +289,7 @@ impl DataSource { | (Self::Offchain(_), TriggerData::Subgraph(_)) | (Self::Subgraph(_), TriggerData::Onchain(_)) | (Self::Subgraph(_), TriggerData::Offchain(_)) => Ok(None), - (Self::Nozzle(_), _) => Ok(None), + (Self::Amp(_), _) => Ok(None), } } @@ -306,7 +306,7 @@ impl DataSource { Self::Onchain(ds) => ds.as_stored_dynamic_data_source(), Self::Offchain(ds) => ds.as_stored_dynamic_data_source(), Self::Subgraph(_) => todo!(), // TODO(krishna) - Self::Nozzle(_) => unreachable!(), + Self::Amp(_) => unreachable!(), } } @@ -332,7 +332,7 @@ impl DataSource { Self::Onchain(ds) => ds.validate(spec_version), Self::Offchain(_) => vec![], Self::Subgraph(_) => vec![], // TODO(krishna) - Self::Nozzle(_) => Vec::new(), + Self::Amp(_) => Vec::new(), } } @@ -341,7 +341,7 @@ impl DataSource { Self::Onchain(_) => CausalityRegion::ONCHAIN, Self::Offchain(ds) => ds.causality_region, Self::Subgraph(_) => CausalityRegion::ONCHAIN, - Self::Nozzle(_) => CausalityRegion::ONCHAIN, + Self::Amp(_) => CausalityRegion::ONCHAIN, } } } @@ -351,15 +351,15 @@ pub enum UnresolvedDataSource { Onchain(C::UnresolvedDataSource), Offchain(offchain::UnresolvedDataSource), Subgraph(subgraph::UnresolvedDataSource), - Nozzle(nozzle::manifest::data_source::RawDataSource), + Amp(amp::manifest::data_source::RawDataSource), } impl UnresolvedDataSource { - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -376,10 +376,10 @@ impl UnresolvedDataSource { .await .map(DataSource::Onchain), Self::Subgraph(unresolved) => unresolved - .resolve::( + .resolve::( deployment_hash, resolver, - nozzle_client, + amp_client, logger, manifest_idx, spec_version, @@ -392,13 +392,13 @@ impl UnresolvedDataSource { for 
details see https://github.com/graphprotocol/graph-node/issues/3864" ); } - Self::Nozzle(raw_data_source) => match nozzle_client { - Some(nozzle_client) => raw_data_source - .resolve(logger, resolver.as_ref(), nozzle_client.as_ref()) + Self::Amp(raw_data_source) => match amp_client { + Some(amp_client) => raw_data_source + .resolve(logger, resolver.as_ref(), amp_client.as_ref()) .await - .map(DataSource::Nozzle) + .map(DataSource::Amp) .map_err(Error::from), - None => Err(anyhow!("support for Nozzle data sources is not enabled")), + None => Err(anyhow!("support for Amp data sources is not enabled")), }, } .with_context(|| format!("failed to resolve data source at index {manifest_idx}")) @@ -667,7 +667,7 @@ impl Clone for DataSource { Self::Onchain(ds) => Self::Onchain(ds.clone()), Self::Offchain(ds) => Self::Offchain(ds.clone()), Self::Subgraph(ds) => Self::Subgraph(ds.clone()), - Self::Nozzle(ds) => Self::Nozzle(ds.clone()), + Self::Amp(ds) => Self::Amp(ds.clone()), } } } @@ -703,9 +703,9 @@ impl<'de, C: Blockchain> Deserialize<'de> for UnresolvedDataSource { subgraph::UnresolvedDataSource::deserialize(map.into_deserializer()) .map_err(serde::de::Error::custom) .map(UnresolvedDataSource::Subgraph) - } else if nozzle::manifest::DataSource::KIND == kind { - nozzle::manifest::data_source::RawDataSource::deserialize(map.into_deserializer()) - .map(UnresolvedDataSource::Nozzle) + } else if amp::manifest::DataSource::KIND == kind { + amp::manifest::data_source::RawDataSource::deserialize(map.into_deserializer()) + .map(UnresolvedDataSource::Amp) .map_err(serde::de::Error::custom) } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { C::UnresolvedDataSource::deserialize(map.into_deserializer()) diff --git a/graph/src/data_source/subgraph.rs b/graph/src/data_source/subgraph.rs index 30784690634..c9f01cf4890 100644 --- a/graph/src/data_source/subgraph.rs +++ b/graph/src/data_source/subgraph.rs @@ -28,7 +28,7 @@ use super::{ }, DataSourceTemplateInfo, TriggerWithHandler, }; -use crate::nozzle; +use crate::amp; pub const SUBGRAPH_DS_KIND: &str = "subgraph"; @@ -283,11 +283,11 @@ impl UnresolvedDataSource { Ok(()) } - async fn resolve_source_manifest( + async fn resolve_source_manifest( &self, deployment_hash: &DeploymentHash, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, ) -> Result>, Error> { let resolver: Arc = @@ -324,7 +324,7 @@ impl UnresolvedDataSource { .resolve( &deployment_hash, &resolver, - nozzle_client, + amp_client, logger, LATEST_VERSION.clone(), ) @@ -337,10 +337,10 @@ impl UnresolvedDataSource { } /// Recursively verifies that all grafts in the chain meet the minimum spec version requirement for a subgraph source - async fn verify_graft_chain_sourcable( + async fn verify_graft_chain_sourcable( manifest: Arc>, resolver: &Arc, - nozzle_client: Option>, + amp_client: Option>, logger: &Logger, graft_chain: &mut Vec, ) -> Result<(), Error> { @@ -376,7 +376,7 @@ impl UnresolvedDataSource { .resolve( &manifest.id, resolver, - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), logger, LATEST_VERSION.clone(), ) @@ -386,7 +386,7 @@ impl UnresolvedDataSource { Box::pin(Self::verify_graft_chain_sourcable( Arc::new(graft_manifest), resolver, - nozzle_client, + amp_client, logger, graft_chain, )) @@ -397,11 +397,11 @@ impl UnresolvedDataSource { } #[allow(dead_code)] - pub(super) async fn resolve( + pub(super) async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, - nozzle_client: Option>, + 
amp_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -414,10 +414,10 @@ impl UnresolvedDataSource { let kind = self.kind.clone(); let source_manifest = self - .resolve_source_manifest::( + .resolve_source_manifest::( deployment_hash, resolver, - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), logger, ) .await?; @@ -436,7 +436,7 @@ impl UnresolvedDataSource { Self::verify_graft_chain_sourcable( source_manifest.clone(), resolver, - nozzle_client, + amp_client, logger, &mut graft_chain, ) diff --git a/graph/src/env/nozzle.rs b/graph/src/env/amp.rs similarity index 82% rename from graph/src/env/nozzle.rs rename to graph/src/env/amp.rs index a64a691d447..909db4134ad 100644 --- a/graph/src/env/nozzle.rs +++ b/graph/src/env/amp.rs @@ -1,8 +1,8 @@ use std::time::Duration; -/// Contains environment variables related to Nozzle subgraphs. +/// Contains environment variables related to Amp subgraphs. #[derive(Debug)] -pub struct NozzleEnv { +pub struct AmpEnv { /// Maximum number of record batches to buffer in memory per stream for each SQL query. /// This is the maximum number of record batches that can be output by a single block. /// @@ -10,23 +10,23 @@ pub struct NozzleEnv { pub max_buffer_size: usize, /// Maximum number of blocks to request per stream for each SQL query. - /// Limiting this value reduces load on the Nozzle server when processing heavy queries. + /// Limiting this value reduces load on the Amp server when processing heavy queries. /// /// Defaults to `2,000,000`. pub max_block_range: usize, - /// Minimum time to wait before retrying a failed SQL query to the Nozzle server. + /// Minimum time to wait before retrying a failed SQL query to the Amp server. /// /// Defaults to `1` second. pub query_retry_min_delay: Duration, - /// Maximum time to wait before retrying a failed SQL query to the Nozzle server. + /// Maximum time to wait before retrying a failed SQL query to the Amp server. /// /// Defaults to `600` seconds. 
pub query_retry_max_delay: Duration, } -impl NozzleEnv { +impl AmpEnv { const DEFAULT_MAX_BUFFER_SIZE: usize = 1_000; const DEFAULT_MAX_BLOCK_RANGE: usize = 2_000_000; const DEFAULT_QUERY_RETRY_MIN_DELAY: Duration = Duration::from_secs(1); @@ -35,7 +35,7 @@ impl NozzleEnv { pub(super) fn new(raw_env: &super::Inner) -> Self { Self { max_buffer_size: raw_env - .nozzle_max_buffer_size + .amp_max_buffer_size .and_then(|value| { if value == 0 { return None; @@ -44,7 +44,7 @@ impl NozzleEnv { }) .unwrap_or(Self::DEFAULT_MAX_BUFFER_SIZE), max_block_range: raw_env - .nozzle_max_block_range + .amp_max_block_range .and_then(|mut value| { if value == 0 { value = usize::MAX; @@ -53,11 +53,11 @@ impl NozzleEnv { }) .unwrap_or(Self::DEFAULT_MAX_BLOCK_RANGE), query_retry_min_delay: raw_env - .nozzle_query_retry_min_delay_seconds + .amp_query_retry_min_delay_seconds .map(Duration::from_secs) .unwrap_or(Self::DEFAULT_QUERY_RETRY_MIN_DELAY), query_retry_max_delay: raw_env - .nozzle_query_retry_max_delay_seconds + .amp_query_retry_max_delay_seconds .map(Duration::from_secs) .unwrap_or(Self::DEFAULT_QUERY_RETRY_MAX_DELAY), } diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs index 240c0423757..00624c5ab75 100644 --- a/graph/src/env/mod.rs +++ b/graph/src/env/mod.rs @@ -1,6 +1,6 @@ +mod amp; mod graphql; mod mappings; -mod nozzle; mod store; use std::{collections::HashSet, env::VarError, fmt, str::FromStr, sync::Arc, time::Duration}; @@ -17,7 +17,7 @@ use crate::{ runtime::gas::CONST_MAX_GAS_PER_HANDLER, }; -pub use self::nozzle::NozzleEnv; +pub use self::amp::AmpEnv; #[cfg(debug_assertions)] use std::sync::Mutex; @@ -54,7 +54,7 @@ pub struct EnvVars { pub graphql: EnvVarsGraphQl, pub mappings: EnvVarsMapping, pub store: EnvVarsStore, - pub nozzle: Arc, + pub amp: Arc, /// Enables query throttling when getting database connections goes over this value. /// Load management can be disabled by setting this to 0. @@ -301,7 +301,7 @@ impl EnvVars { graphql, mappings: mapping_handlers, store, - nozzle: Arc::new(NozzleEnv::new(&inner)), + amp: Arc::new(AmpEnv::new(&inner)), load_threshold: Duration::from_millis(inner.load_threshold_in_ms), load_jail_threshold: inner.load_jail_threshold, @@ -594,14 +594,14 @@ struct Inner { )] disable_deployment_hash_validation: EnvVarBoolean, - #[envconfig(from = "GRAPH_NOZZLE_MAX_BUFFER_SIZE")] - nozzle_max_buffer_size: Option, - #[envconfig(from = "GRAPH_NOZZLE_MAX_BLOCK_RANGE")] - nozzle_max_block_range: Option, - #[envconfig(from = "GRAPH_NOZZLE_QUERY_RETRY_MIN_DELAY_SECONDS")] - nozzle_query_retry_min_delay_seconds: Option, - #[envconfig(from = "GRAPH_NOZZLE_QUERY_RETRY_MAX_DELAY_SECONDS")] - nozzle_query_retry_max_delay_seconds: Option, + #[envconfig(from = "GRAPH_AMP_MAX_BUFFER_SIZE")] + amp_max_buffer_size: Option, + #[envconfig(from = "GRAPH_AMP_MAX_BLOCK_RANGE")] + amp_max_block_range: Option, + #[envconfig(from = "GRAPH_AMP_QUERY_RETRY_MIN_DELAY_SECONDS")] + amp_query_retry_min_delay_seconds: Option, + #[envconfig(from = "GRAPH_AMP_QUERY_RETRY_MAX_DELAY_SECONDS")] + amp_query_retry_max_delay_seconds: Option, } #[derive(Clone, Debug)] diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 7095cede7a1..cdc50d0f4e2 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -37,7 +37,7 @@ pub mod env; pub mod ipfs; -pub mod nozzle; +pub mod amp; /// Wrapper for spawning tasks that abort on panic, which is our default. 
mod task_spawn; diff --git a/node/src/bin/manager.rs b/node/src/bin/manager.rs index a6d881a8747..a35e543a5a4 100644 --- a/node/src/bin/manager.rs +++ b/node/src/bin/manager.rs @@ -109,10 +109,10 @@ pub struct Opt { #[clap( long, value_name = "{HOST:PORT|URL}", - env = "GRAPH_NOZZLE_FLIGHT_SERVICE_ADDRESS", - help = "The address of the Nozzle Flight gRPC service" + env = "GRAPH_AMP_FLIGHT_SERVICE_ADDRESS", + help = "The address of the Amp Flight gRPC service" )] - pub nozzle_flight_service_address: Option, + pub amp_flight_service_address: Option, #[clap(subcommand)] pub cmd: Command, @@ -1340,7 +1340,7 @@ async fn main() -> anyhow::Result<()> { network_name, ipfs_url, arweave_url, - opt.nozzle_flight_service_address.clone(), + opt.amp_flight_service_address.clone(), config, metrics_ctx, node_id, diff --git a/node/src/launcher.rs b/node/src/launcher.rs index 1cb6558db1b..8195f6fd7fb 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -16,8 +16,8 @@ use graph::prelude::*; use graph::prometheus::Registry; use graph::url::Url; use graph::{ + amp, blockchain::{Blockchain, BlockchainKind, BlockchainMap}, - nozzle, }; use graph_core::polling_monitor::{arweave_service, ArweaveService, IpfsService}; use graph_graphql::prelude::GraphQlRunner; @@ -258,7 +258,7 @@ fn deploy_subgraph_from_flag( ); } -fn build_subgraph_registrar( +fn build_subgraph_registrar( metrics_registry: Arc, network_store: &Arc, logger_factory: &LoggerFactory, @@ -270,18 +270,18 @@ fn build_subgraph_registrar( subscription_manager: Arc, arweave_service: ArweaveService, ipfs_service: IpfsService, - nozzle_client: Option>, + amp_client: Option>, cancel_token: CancellationToken, ) -> Arc< graph_core::subgraph::SubgraphRegistrar< graph_core::subgraph_provider::SubgraphProvider, SubgraphStore, SubscriptionManager, - NC, + AC, >, > where - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { let static_filters = ENV_VARS.experimental_static_filters; let sg_count = Arc::new(SubgraphCountMetric::new(metrics_registry.cheap_clone())); @@ -289,20 +289,20 @@ where let mut subgraph_instance_managers = graph_core::subgraph_provider::SubgraphInstanceManagers::new(); - if let Some(nozzle_client) = nozzle_client.cheap_clone() { - let nozzle_instance_manager = graph_core::nozzle_subgraph::Manager::new( + if let Some(amp_client) = amp_client.cheap_clone() { + let amp_instance_manager = graph_core::amp_subgraph::Manager::new( &logger_factory, metrics_registry.cheap_clone(), env_vars.cheap_clone(), &cancel_token, network_store.subgraph_store(), link_resolver.cheap_clone(), - nozzle_client, + amp_client, ); subgraph_instance_managers.add( graph_core::subgraph_provider::SubgraphProcessingKind::Amp, - Arc::new(nozzle_instance_manager), + Arc::new(amp_instance_manager), ); } @@ -316,7 +316,7 @@ where link_resolver.clone(), ipfs_service, arweave_service, - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), static_filters, ); @@ -343,7 +343,7 @@ where Arc::new(subgraph_provider), network_store.subgraph_store(), subscription_manager, - nozzle_client, + amp_client, blockchain_map, node_id.clone(), version_switching_mode, @@ -499,17 +499,17 @@ pub async fn run( &logger_factory, ); - let nozzle_client = match opt.nozzle_flight_service_address.as_deref() { - Some(nozzle_flight_service_address) => { - let addr = nozzle_flight_service_address + let amp_client = match opt.amp_flight_service_address.as_deref() { + Some(amp_flight_service_address) => { + let addr = amp_flight_service_address .parse() - 
.expect("Invalid Nozzle Flight service address"); + .expect("Invalid Amp Flight service address"); - let nozzle_client = nozzle::FlightClient::new(addr) + let amp_client = amp::FlightClient::new(addr) .await - .expect("Failed to connect to Nozzle Flight service"); + .expect("Failed to connect to Amp Flight service"); - Some(Arc::new(nozzle_client)) + Some(Arc::new(amp_client)) } None => None, }; @@ -548,7 +548,7 @@ pub async fn run( blockchain_map.clone(), network_store.clone(), link_resolver.clone(), - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), ); if !opt.disable_block_ingestor { @@ -574,7 +574,7 @@ pub async fn run( subscription_manager, arweave_service, ipfs_service, - nozzle_client, + amp_client, cancel_token, ); diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs index 9dd8f1bc39c..bd80dedea0e 100644 --- a/node/src/manager/commands/run.rs +++ b/node/src/manager/commands/run.rs @@ -7,6 +7,7 @@ use crate::manager::PanicSubscriptionManager; use crate::network_setup::Networks; use crate::store_builder::StoreBuilder; use crate::MetricsContext; +use graph::amp; use graph::anyhow::bail; use graph::cheap_clone::CheapClone; use graph::components::link_resolver::{ArweaveClient, FileSizeLimit}; @@ -15,7 +16,6 @@ use graph::components::store::DeploymentLocator; use graph::components::subgraph::{Settings, SubgraphInstanceManager as _}; use graph::endpoint::EndpointMetrics; use graph::env::EnvVars; -use graph::nozzle; use graph::prelude::{ anyhow, tokio, BlockNumber, DeploymentHash, IpfsResolver, LoggerFactory, NodeId, SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore, @@ -40,7 +40,7 @@ pub async fn run( _network_name: String, ipfs_url: Vec, arweave_url: String, - nozzle_flight_service_address: Option, + amp_flight_service_address: Option, config: Config, metrics_ctx: MetricsContext, node_id: NodeId, @@ -144,34 +144,34 @@ pub async fn run( let mut subgraph_instance_managers = graph_core::subgraph_provider::SubgraphInstanceManagers::new(); - let nozzle_client = match nozzle_flight_service_address { - Some(nozzle_flight_service_address) => { - let addr = nozzle_flight_service_address + let amp_client = match amp_flight_service_address { + Some(amp_flight_service_address) => { + let addr = amp_flight_service_address .parse() - .expect("Invalid Nozzle Flight service address"); + .expect("Invalid Amp Flight service address"); - let nozzle_client = Arc::new( - nozzle::FlightClient::new(addr) + let amp_client = Arc::new( + amp::FlightClient::new(addr) .await - .expect("Failed to connect to Nozzle Flight service"), + .expect("Failed to connect to Amp Flight service"), ); - let nozzle_instance_manager = graph_core::nozzle_subgraph::Manager::new( + let amp_instance_manager = graph_core::amp_subgraph::Manager::new( &logger_factory, metrics_registry.cheap_clone(), env_vars.cheap_clone(), &cancel_token, network_store.subgraph_store(), link_resolver.cheap_clone(), - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), ); subgraph_instance_managers.add( graph_core::subgraph_provider::SubgraphProcessingKind::Amp, - Arc::new(nozzle_instance_manager), + Arc::new(amp_instance_manager), ); - Some(nozzle_client) + Some(amp_client) } None => None, }; @@ -186,7 +186,7 @@ pub async fn run( link_resolver.cheap_clone(), ipfs_service, arweave_service, - nozzle_client.cheap_clone(), + amp_client.cheap_clone(), static_filters, ); @@ -211,7 +211,7 @@ pub async fn run( subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, - 
nozzle_client, + amp_client, blockchain_map, node_id.clone(), SubgraphVersionSwitchingMode::Instant, diff --git a/node/src/opt.rs b/node/src/opt.rs index f906de72ac0..3708a7da493 100644 --- a/node/src/opt.rs +++ b/node/src/opt.rs @@ -234,10 +234,10 @@ pub struct Opt { #[clap( long, value_name = "{HOST:PORT|URL}", - env = "GRAPH_NOZZLE_FLIGHT_SERVICE_ADDRESS", - help = "The address of the Nozzle Flight gRPC service" + env = "GRAPH_AMP_FLIGHT_SERVICE_ADDRESS", + help = "The address of the Amp Flight gRPC service" )] - pub nozzle_flight_service_address: Option, + pub amp_flight_service_address: Option, } impl From for config::Opt { diff --git a/runtime/wasm/src/host.rs b/runtime/wasm/src/host.rs index f67ea8d5ee7..aa079381a94 100644 --- a/runtime/wasm/src/host.rs +++ b/runtime/wasm/src/host.rs @@ -363,7 +363,7 @@ impl RuntimeHostTrait for RuntimeHost { DataSource::Onchain(_) => None, DataSource::Offchain(ds) => ds.done_at(), DataSource::Subgraph(_) => None, - DataSource::Nozzle(_) => None, + DataSource::Amp(_) => None, } } @@ -372,7 +372,7 @@ impl RuntimeHostTrait for RuntimeHost { DataSource::Onchain(_) => {} DataSource::Offchain(ds) => ds.set_done_at(block), DataSource::Subgraph(_) => {} - DataSource::Nozzle(_) => {} + DataSource::Amp(_) => {} } } diff --git a/server/index-node/src/resolver.rs b/server/index-node/src/resolver.rs index 9c59066b1c3..af6fd0888ec 100644 --- a/server/index-node/src/resolver.rs +++ b/server/index-node/src/resolver.rs @@ -7,6 +7,7 @@ use graph::schema::EntityType; use web3::types::Address; use git_testament::{git_testament, CommitKind}; +use graph::amp; use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; use graph::components::link_resolver::LinkResolverContext; use graph::components::store::{BlockPtrForNumber, BlockStore, QueryPermit, Store}; @@ -15,7 +16,6 @@ use graph::data::graphql::{object, IntoValue, ObjectOrInterface, ValueMap}; use graph::data::subgraph::{status, DeploymentFeatures}; use graph::data::value::Object; use graph::futures03::TryFutureExt; -use graph::nozzle; use graph::prelude::*; use graph_graphql::prelude::{a, ExecutionContext, Resolver}; @@ -96,25 +96,25 @@ impl IntoValue for PublicProofOfIndexingResult { /// Resolver for the index node GraphQL API. 
#[derive(Clone)] -pub struct IndexNodeResolver { +pub struct IndexNodeResolver { logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, bearer_token: Option, } -impl IndexNodeResolver +impl IndexNodeResolver where S: Store, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { pub fn new( logger: &Logger, store: Arc, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, bearer_token: Option, blockchain_map: Arc, ) -> Self { @@ -125,7 +125,7 @@ where blockchain_map, store, link_resolver, - nozzle_client, + amp_client, bearer_token, } } @@ -522,7 +522,7 @@ where deployment_hash.clone(), raw_yaml, &self.link_resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -540,7 +540,7 @@ where deployment_hash.clone(), raw_yaml, &self.link_resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -558,7 +558,7 @@ where deployment_hash.clone(), raw_yaml, &self.link_resolver, - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), &self.logger, max_spec_version, ) @@ -693,10 +693,10 @@ where } #[async_trait] -impl BlockPtrForNumber for IndexNodeResolver +impl BlockPtrForNumber for IndexNodeResolver where S: Store, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { async fn block_ptr_for_number( &self, @@ -770,10 +770,10 @@ fn entity_changes_to_graphql(entity_changes: Vec) -> r::Value { } #[async_trait] -impl Resolver for IndexNodeResolver +impl Resolver for IndexNodeResolver where S: Store, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { const CACHEABLE: bool = false; diff --git a/server/index-node/src/server.rs b/server/index-node/src/server.rs index 138d5e85d69..00b62c09ca2 100644 --- a/server/index-node/src/server.rs +++ b/server/index-node/src/server.rs @@ -1,29 +1,29 @@ use graph::{ + amp, blockchain::BlockchainMap, cheap_clone::CheapClone, components::{ server::server::{start, ServerHandle}, store::Store, }, - nozzle, prelude::*, }; use crate::service::IndexNodeService; /// A GraphQL server based on Hyper. -pub struct IndexNodeServer { +pub struct IndexNodeServer { logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, } -impl IndexNodeServer +impl IndexNodeServer where S: Store, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { /// Creates a new GraphQL server. 
pub fn new( @@ -31,7 +31,7 @@ where blockchain_map: Arc, store: Arc, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, ) -> Self { let logger = logger_factory.component_logger( "IndexNodeServer", @@ -47,7 +47,7 @@ where blockchain_map, store, link_resolver, - nozzle_client, + amp_client, } } @@ -68,7 +68,7 @@ where self.blockchain_map.clone(), store, self.link_resolver.clone(), - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), )); start(logger_for_service.clone(), port, move |req| { diff --git a/server/index-node/src/service.rs b/server/index-node/src/service.rs index 6dcf0138566..5aa00058e6c 100644 --- a/server/index-node/src/service.rs +++ b/server/index-node/src/service.rs @@ -15,9 +15,9 @@ use graph::hyper::header::{ }; use graph::hyper::{body::Body, Method, Request, Response, StatusCode}; +use graph::amp; use graph::components::{server::query::ServerError, store::Store}; use graph::data::query::{Query, QueryError, QueryResult, QueryResults}; -use graph::nozzle; use graph::prelude::{q, serde_json}; use graph::slog::{debug, error, Logger}; use graph_graphql::prelude::{execute_query, Query as PreparedQuery, QueryExecutionOptions}; @@ -40,26 +40,26 @@ impl GraphQLMetrics for NoopGraphQLMetrics { /// A Hyper Service that serves GraphQL over a POST / endpoint. #[derive(Debug)] -pub struct IndexNodeService { +pub struct IndexNodeService { logger: Logger, blockchain_map: Arc, store: Arc, explorer: Arc>, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, } -impl IndexNodeService +impl IndexNodeService where S: Store, - NC: nozzle::Client + Send + Sync + 'static, + AC: amp::Client + Send + Sync + 'static, { pub fn new( logger: Logger, blockchain_map: Arc, store: Arc, link_resolver: Arc, - nozzle_client: Option>, + amp_client: Option>, ) -> Self { let explorer = Arc::new(Explorer::new(store.clone())); @@ -69,7 +69,7 @@ where store, explorer, link_resolver, - nozzle_client, + amp_client, } } @@ -143,7 +143,7 @@ where &logger, store, self.link_resolver.clone(), - self.nozzle_client.cheap_clone(), + self.amp_client.cheap_clone(), validated.bearer_token, self.blockchain_map.clone(), ); diff --git a/store/test-store/tests/chain/ethereum/manifest.rs b/store/test-store/tests/chain/ethereum/manifest.rs index 7cd66ff48b5..f52930f71bd 100644 --- a/store/test-store/tests/chain/ethereum/manifest.rs +++ b/store/test-store/tests/chain/ethereum/manifest.rs @@ -4,6 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use graph::amp; use graph::blockchain::DataSource; use graph::components::store::BLOCK_NUMBER_MAX; use graph::data::store::scalar::Bytes; @@ -17,7 +18,6 @@ use graph::data_source::offchain::OffchainDataSourceKind; use graph::data_source::{DataSourceEnum, DataSourceTemplate}; use graph::entity; use graph::env::ENV_VARS; -use graph::nozzle; use graph::prelude::web3::types::H256; use graph::prelude::{ anyhow, async_trait, serde_yaml, tokio, BigDecimal, BigInt, DeploymentHash, Link, @@ -143,7 +143,7 @@ async fn try_resolve_manifest( id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, max_spec_version, ) @@ -173,7 +173,7 @@ async fn resolve_unvalidated(text: &str) -> UnvalidatedSubgraphManifest { id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1329,7 +1329,7 @@ schema: id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1382,7 +1382,7 @@ schema: id, raw, &resolver, - Option::>::None, + Option::>::None, 
&LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1459,7 +1459,7 @@ dataSources: id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1538,7 +1538,7 @@ dataSources: id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_0_0_4.clone(), ) @@ -1648,7 +1648,7 @@ dataSources: id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_1_2_0.clone(), ) @@ -1722,7 +1722,7 @@ dataSources: id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_1_3_0.clone(), ) @@ -1873,7 +1873,7 @@ specVersion: 1.3.0 id, raw, &resolver, - Option::>::None, + Option::>::None, &LOGGER, SPEC_VERSION_1_3_0.clone(), ) diff --git a/tests/src/fixture/mod.rs b/tests/src/fixture/mod.rs index b661f9e6d56..27daa844c1c 100644 --- a/tests/src/fixture/mod.rs +++ b/tests/src/fixture/mod.rs @@ -9,6 +9,7 @@ use std::time::{Duration, Instant}; use anyhow::Error; use async_stream::stream; +use graph::amp; use graph::blockchain::block_stream::{ BlockRefetcher, BlockStream, BlockStreamBuilder, BlockStreamError, BlockStreamEvent, BlockWithTriggers, FirehoseCursor, @@ -37,7 +38,6 @@ use graph::http_body_util::Full; use graph::hyper::body::Bytes; use graph::hyper::Request; use graph::ipfs::{IpfsClient, IpfsMetrics}; -use graph::nozzle; use graph::prelude::ethabi::ethereum_types::H256; use graph::prelude::serde_json::{self, json}; use graph::prelude::{ @@ -162,7 +162,7 @@ pub struct TestContext { pub instance_manager: Arc< graph_core::subgraph::SubgraphInstanceManager< graph_store_postgres::SubgraphStore, - nozzle::FlightClient, + amp::FlightClient, >, >, pub link_resolver: Arc, @@ -170,8 +170,7 @@ pub struct TestContext { pub env_vars: Arc, pub ipfs: Arc, graphql_runner: Arc, - indexing_status_service: - Arc>, + indexing_status_service: Arc>, } #[derive(Deserialize)] @@ -606,7 +605,7 @@ pub async fn setup_inner( subgraph_provider.cheap_clone(), subgraph_store.clone(), panicking_subscription_manager, - Option::>::None, + Option::>::None, blockchain_map.clone(), node_id.clone(), SubgraphVersionSwitchingMode::Instant, From 82cf29aab582c1e25d9391be706d9e6d972b89d1 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:57:20 +0200 Subject: [PATCH 22/40] fix(graph): produce consistent query hashes for logging --- graph/src/amp/client/flight_client.rs | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs index 097b66beca0..9b5bc8075ed 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -1,11 +1,6 @@ -use std::{ - collections::HashMap, - hash::{Hash, Hasher}, - ops::RangeInclusive, - time::Duration, -}; +use std::{collections::HashMap, ops::RangeInclusive, time::Duration}; -use ahash::AHasher; +use ahash::RandomState; use alloy::primitives::{BlockHash, BlockNumber}; use arrow::{datatypes::Schema, error::ArrowError}; use arrow_flight::{ @@ -109,9 +104,15 @@ impl Client for FlightClient { request_metadata: Option, ) -> BoxStream<'static, Result> { let query = query.to_string(); + + // Generates a hash from the SQL query for log correlation. + // The hash allows connecting related logs without including the full SQL query in every log message. + // Constant seeds ensure consistent hashes for the same query. 
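// Illustration (a sketch, not lines from this patch): with constant seeds the value
// returned by `hash_one` depends only on the query text, so separate client
// instances running the same build log the same `query_hash` for the same SQL:
//
//     let a = ahash::RandomState::with_seeds(0, 0, 0, 0).hash_one("SELECT 1");
//     let b = ahash::RandomState::with_seeds(0, 0, 0, 0).hash_one("SELECT 1");
//     assert_eq!(a, b);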
+ let hasher = RandomState::with_seeds(0, 0, 0, 0); + let logger = logger .component("AmpFlightClient") - .new(slog::o!("query_id" => query_id(&query))); + .new(slog::o!("query_hash" => hasher.hash_one(&query))); let mut raw_client = self.raw_client(); let mut prev_block_ranges: Vec = Vec::new(); @@ -306,16 +307,6 @@ impl From for BlockRange { } } -/// Generates an ID from a SQL query for log correlation. -/// -/// The ID allows connecting related logs without including the full SQL -/// query in every log message. -fn query_id(query: &str) -> u32 { - let mut hasher = AHasher::default(); - query.hash(&mut hasher); - hasher.finish() as u32 -} - /// Serializes the information required to resume a streaming SQL query to JSON. fn serialize_resume_streaming_query(resume_streaming_query: Vec) -> String { #[derive(Serialize)] From 762e27fd2d9d61b3611bae4597162af04e9bb414 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:47:20 +0200 Subject: [PATCH 23/40] fix(core, graph): simplify SQL query requirements Only require block number columns and try to load block hashes and timestamps from the source tables --- .../amp_subgraph/runner/data_processing.rs | 2 +- core/src/amp_subgraph/runner/data_stream.rs | 2 +- graph/src/amp/codec/utils.rs | 64 ++++-- graph/src/amp/manifest/data_source/mod.rs | 4 +- graph/src/amp/manifest/data_source/raw.rs | 194 ++++++++++++++---- graph/src/amp/sql/mod.rs | 4 +- graph/src/amp/sql/query/filter_blocks.rs | 183 ----------------- graph/src/amp/sql/query/mod.rs | 153 -------------- .../amp/sql/query/resolve_event_signatures.rs | 110 ---------- .../amp/sql/query/resolve_source_address.rs | 81 -------- graph/src/amp/sql/query/validate_tables.rs | 91 -------- .../sql/query_builder/block_range_query.rs | 186 +++++++++++++++++ .../amp/sql/query_builder/context_query.rs | 103 ++++++++++ .../query_builder/event_signature_resolver.rs | 183 +++++++++++++++++ graph/src/amp/sql/query_builder/mod.rs | 191 +++++++++++++++++ graph/src/amp/sql/query_builder/parser.rs | 159 ++++++++++++++ .../query_builder/source_address_resolver.rs | 133 ++++++++++++ .../amp/sql/query_builder/table_extractor.rs | 153 ++++++++++++++ .../amp/sql/query_builder/table_validator.rs | 94 +++++++++ .../stream_aggregator/record_batch/decoder.rs | 4 +- .../record_batch/group_data.rs | 88 -------- 21 files changed, 1409 insertions(+), 773 deletions(-) delete mode 100644 graph/src/amp/sql/query/filter_blocks.rs delete mode 100644 graph/src/amp/sql/query/mod.rs delete mode 100644 graph/src/amp/sql/query/resolve_event_signatures.rs delete mode 100644 graph/src/amp/sql/query/resolve_source_address.rs delete mode 100644 graph/src/amp/sql/query/validate_tables.rs create mode 100644 graph/src/amp/sql/query_builder/block_range_query.rs create mode 100644 graph/src/amp/sql/query_builder/context_query.rs create mode 100644 graph/src/amp/sql/query_builder/event_signature_resolver.rs create mode 100644 graph/src/amp/sql/query_builder/mod.rs create mode 100644 graph/src/amp/sql/query_builder/parser.rs create mode 100644 graph/src/amp/sql/query_builder/source_address_resolver.rs create mode 100644 graph/src/amp/sql/query_builder/table_extractor.rs create mode 100644 graph/src/amp/sql/query_builder/table_validator.rs delete mode 100644 graph/src/nozzle/stream_aggregator/record_batch/group_data.rs diff --git a/core/src/amp_subgraph/runner/data_processing.rs b/core/src/amp_subgraph/runner/data_processing.rs index bb6dc87597a..eb7f303e367 100644 --- 
a/core/src/amp_subgraph/runner/data_processing.rs +++ b/core/src/amp_subgraph/runner/data_processing.rs @@ -215,7 +215,7 @@ fn decode_block_timestamp(record_batches: &[StreamRecordBatch]) -> Result { + Ok((_, decoder)) => { return decoder .decode(0) .map_err(|e| Error::Deterministic(e))? diff --git a/core/src/amp_subgraph/runner/data_stream.rs b/core/src/amp_subgraph/runner/data_stream.rs index 1e8211b3e8e..ec532f52adc 100644 --- a/core/src/amp_subgraph/runner/data_stream.rs +++ b/core/src/amp_subgraph/runner/data_stream.rs @@ -70,7 +70,7 @@ where } for (j, table) in data_source.transformer.tables.iter().enumerate() { - let query = table.query.with_block_range_filter(block_range); + let query = table.query.build_with_block_range(block_range); query_streams.push(cx.client.query(&cx.logger, query, None)); query_streams_table_ptr.push((i, j)); diff --git a/graph/src/amp/codec/utils.rs b/graph/src/amp/codec/utils.rs index dab04699005..4f6ba4ff0b1 100644 --- a/graph/src/amp/codec/utils.rs +++ b/graph/src/amp/codec/utils.rs @@ -10,61 +10,85 @@ use crate::amp::common::column_aliases; pub fn auto_block_number_decoder<'a>( record_batch: &'a RecordBatch, -) -> Result> + 'a>> { - let column_index = column_index(record_batch, column_aliases::BLOCK_NUMBER) - .context("failed to find block numbers column")?; +) -> Result<(&'static str, Box> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_NUMBER) + .with_context(|| { + format!( + "failed to find block numbers column; expected one of: {}", + column_aliases::BLOCK_NUMBER.join(", ") + ) + })?; block_number_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) } pub fn block_number_decoder<'a>( record_batch: &'a RecordBatch, column_index: usize, ) -> Result> + 'a>> { - column_decoder::(record_batch, column_index) + column_decoder::(record_batch, column_index, false) } pub fn auto_block_hash_decoder<'a>( record_batch: &'a RecordBatch, -) -> Result> + 'a>> { - let column_index = column_index(record_batch, column_aliases::BLOCK_HASH) - .context("failed to find block hashes column")?; +) -> Result<(&'static str, Box> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_HASH) + .with_context(|| { + format!( + "failed to find block hashes column; expected one of: {}", + column_aliases::BLOCK_HASH.join(", ") + ) + })?; block_hash_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) } pub fn block_hash_decoder<'a>( record_batch: &'a RecordBatch, column_index: usize, ) -> Result> + 'a>> { - column_decoder::(record_batch, column_index) + column_decoder::(record_batch, column_index, false) } pub fn auto_block_timestamp_decoder<'a>( record_batch: &'a RecordBatch, -) -> Result>> + 'a>> { - let column_index = column_index(record_batch, column_aliases::BLOCK_TIMESTAMP) - .context("failed to find block timestamps column")?; +) -> Result<(&'static str, Box>> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_TIMESTAMP) + .with_context(|| { + format!( + "failed to find block timestamps column; expected one of: {}", + column_aliases::BLOCK_TIMESTAMP.join(", ") + ) + })?; block_timestamp_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) } pub fn block_timestamp_decoder<'a>( 
record_batch: &'a RecordBatch, column_index: usize, ) -> Result>> + 'a>> { - column_decoder::>(record_batch, column_index) + column_decoder::>(record_batch, column_index, false) } -pub fn column_index( +pub fn find_column( record_batch: &RecordBatch, - column_names: impl IntoIterator>, -) -> Option { + column_names: impl IntoIterator, +) -> Option<(T, usize)> +where + T: AsRef, +{ let schema_ref = record_batch.schema_ref(); for column_name in column_names { if let Some((column_index, _)) = schema_ref.column_with_name(column_name.as_ref()) { - return Some(column_index); + return Some((column_name, column_index)); } } @@ -74,16 +98,22 @@ pub fn column_index( pub fn column_decoder<'a, T: 'static, U>( record_batch: &'a RecordBatch, column_index: usize, + nullable: bool, ) -> Result> + 'a>> where T: Array, ArrayDecoder<'a, T>: Decoder>, { if column_index >= record_batch.num_columns() { - bail!("column {column_index} does not exist"); + bail!("column does not exist"); } let array = record_batch.column(column_index); + + if !nullable && array.is_nullable() { + bail!("column must not have nullable values"); + } + let decoder = ArrayDecoder::::new(array)?; Ok(Box::new(decoder)) diff --git a/graph/src/amp/manifest/data_source/mod.rs b/graph/src/amp/manifest/data_source/mod.rs index 85de05ec951..ff2a5003bb5 100644 --- a/graph/src/amp/manifest/data_source/mod.rs +++ b/graph/src/amp/manifest/data_source/mod.rs @@ -8,7 +8,7 @@ use arrow::datatypes::Schema; use semver::Version; use crate::{ - amp::{common::Ident, sql::Query}, + amp::{common::Ident, sql::BlockRangeQueryBuilder}, data::subgraph::SPEC_VERSION_1_5_0, }; @@ -106,7 +106,7 @@ pub struct Table { /// The SQL query that executes on the Amp server. /// /// The data resulting from this SQL query execution transforms into subgraph entities. - pub query: Query, + pub query: BlockRangeQueryBuilder, /// The Arrow schema of this transformed table SQL query. 
/// diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 197aacd58b0..10e065544d3 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -5,20 +5,23 @@ use alloy::{ primitives::{Address, BlockNumber}, }; use anyhow::anyhow; -use arrow::datatypes::Schema; +use arrow::{array::RecordBatch, datatypes::Schema}; use futures03::future::try_join_all; use semver::Version; use serde::Deserialize; -use slog::Logger; +use slog::{debug, error, Logger}; use thiserror::Error; use super::{Abi, DataSource, Source, Table, Transformer}; use crate::{ amp::{ self, - common::{column_aliases, Ident}, + codec::utils::{ + auto_block_hash_decoder, auto_block_number_decoder, auto_block_timestamp_decoder, + }, + common::Ident, error::IsDeterministic, - sql::Query, + sql::{BlockRangeQueryBuilder, ContextQuery, ValidQuery}, }, components::link_resolver::{LinkResolver, LinkResolverContext}, data::subgraph::DeploymentHash, @@ -70,6 +73,9 @@ impl RawDataSource { transformer, } = self; + let logger = logger.new(slog::o!("data_source" => name.clone())); + debug!(logger, "Resolving data source"); + let name = Self::resolve_name(name)?; Self::resolve_kind(kind)?; @@ -78,7 +84,7 @@ impl RawDataSource { .map_err(|e| e.source_context("invalid `source`"))?; let transformer = transformer - .resolve(logger, link_resolver, amp_client, &source) + .resolve(&logger, link_resolver, amp_client, &source) .await .map_err(|e| e.source_context("invalid `transformer`"))?; @@ -142,6 +148,7 @@ impl RawSource { start_block, end_block, } = self; + let dataset = Self::resolve_dataset(dataset)?; let tables = Self::resolve_tables(tables)?; let address = address.unwrap_or(Address::ZERO); @@ -265,7 +272,12 @@ impl RawTransformer { } let abi_futs = abis.into_iter().enumerate().map(|(i, abi)| async move { - abi.resolve(logger, link_resolver) + let logger = logger.new(slog::o!("abi_name" => abi.name.clone())); + debug!(logger, "Resolving ABI"; + "file" => &abi.file, + ); + + abi.resolve(&logger, link_resolver) .await .map_err(|e| e.source_context(format!("invalid `abis` at index {i}"))) }); @@ -294,8 +306,13 @@ impl RawTransformer { } let table_futs = tables.into_iter().enumerate().map(|(i, table)| async move { + let logger = logger.new(slog::o!("table_name" => table.name.clone())); + debug!(logger, "Resolving table"; + "file" => ?&table.file + ); + table - .resolve(logger, link_resolver, amp_client, source, abis) + .resolve(&logger, link_resolver, amp_client, source, abis) .await .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) }); @@ -394,16 +411,27 @@ impl RawTable { abis: &[Abi], ) -> Result { let Self { name, query, file } = self; + let name = Self::resolve_name(name)?; let query = match Self::resolve_query(query, source, abis)? 
{ Some(query) => query, None => Self::resolve_file(logger, link_resolver, file, source, abis).await?, }; + + debug!(logger, "Resolving query schema"); let schema = Self::resolve_schema(logger, amp_client, &query).await?; + let block_range_query_builder = Self::resolve_block_range_query_builder( + logger, + amp_client, + source, + query, + schema.clone(), + ) + .await?; Ok(Table { name, - query, + query: block_range_query_builder, schema, }) } @@ -416,7 +444,7 @@ impl RawTable { query: Option, source: &Source, abis: &[Abi], - ) -> Result, Error> { + ) -> Result, Error> { let Some(query) = query else { return Ok(None); }; @@ -425,12 +453,12 @@ impl RawTable { return Err(Error::InvalidValue(anyhow!("`query` cannot be empty"))); } - Query::new( - query, - &source.dataset, - &source.tables, + ValidQuery::new( + &query, + source.dataset.as_str(), + source.tables.iter().map(|table| table.as_str()), &source.address, - abis.iter().map(|abi| (&abi.name, &abi.contract)), + abis.iter().map(|abi| (abi.name.as_str(), &abi.contract)), ) .map(Some) .map_err(|e| Error::InvalidValue(e.context("invalid `query`"))) @@ -442,7 +470,9 @@ impl RawTable { file: Option, source: &Source, abis: &[Abi], - ) -> Result { + ) -> Result { + debug!(logger, "Resolving query file"); + let Some(file) = file else { return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); }; @@ -466,12 +496,12 @@ impl RawTable { return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); } - Query::new( - query, - &source.dataset, - &source.tables, + ValidQuery::new( + &query, + source.dataset.as_str(), + source.tables.iter().map(|table| table.as_str()), &source.address, - abis.iter().map(|abi| (&abi.name, &abi.contract)), + abis.iter().map(|abi| (abi.name.as_str(), &abi.contract)), ) .map_err(|e| Error::InvalidValue(e.context("invalid `file`"))) } @@ -479,32 +509,112 @@ impl RawTable { async fn resolve_schema( logger: &Logger, amp_client: &impl amp::Client, - query: &Query, + query: impl ToString, ) -> Result { - let schema = - amp_client - .schema(logger, &query) - .await - .map_err(|e| Error::FailedToExecuteQuery { - is_deterministic: e.is_deterministic(), - source: anyhow!(e).context("failed to load schema"), - })?; - - let check_required_column = |c: &[&str], kind: &str| { - if !c.iter().any(|&c| schema.column_with_name(c).is_some()) { - return Err(Error::InvalidQuery(anyhow!( - "query must return {kind}; expected column names are: {}", - c.join(", ") - ))); + amp_client + .schema(logger, query) + .await + .map_err(|e| Error::FailedToExecuteQuery { + is_deterministic: e.is_deterministic(), + source: anyhow!(e).context("failed to load schema"), + }) + } + + async fn resolve_block_range_query_builder( + logger: &Logger, + amp_client: &impl amp::Client, + source: &Source, + query: ValidQuery, + schema: Schema, + ) -> Result { + debug!(logger, "Resolving block range query builder"); + + let record_batch = RecordBatch::new_empty(schema.into()); + let (block_number_column, _) = + auto_block_number_decoder(&record_batch).map_err(|e| Error::InvalidQuery(e))?; + + let has_block_hash_column = auto_block_hash_decoder(&record_batch).is_ok(); + let has_block_timestamp_column = auto_block_timestamp_decoder(&record_batch).is_ok(); + + if has_block_hash_column && has_block_timestamp_column { + return Ok(BlockRangeQueryBuilder::new(query, block_number_column)); + } + + debug!(logger, "Resolving context query"); + let mut context_query: Option = None; + + // TODO: Context is embedded in the original query using INNER JOIN to ensure 
availability for every output row. + // This requires all source tables to match or exceed the expected query output size. + let context_sources_iter = source + .tables + .iter() + .map(|table| (source.dataset.as_str(), table.as_str())) + // TODO: Replace hardcoded values with schema metadata sources when available + .chain([("eth_firehose", "blocks"), ("eth_rpc", "blocks")]); + + for (dataset, table) in context_sources_iter { + let context_logger = logger.new(slog::o!( + "context_dataset" => dataset.to_string(), + "context_table" => table.to_string() + )); + debug!(context_logger, "Loading context schema"); + let schema_query = format!("SELECT * FROM {dataset}.{table}"); + let schema = match Self::resolve_schema(logger, amp_client, schema_query).await { + Ok(schema) => schema, + Err(e) => { + error!(context_logger, "Failed to load context schema"; + "e" => ?e + ); + continue; + } + }; + + let record_batch = RecordBatch::new_empty(schema.clone().into()); + let mut columns = Vec::new(); + + if !has_block_hash_column { + let Ok((block_hash_column, _)) = auto_block_hash_decoder(&record_batch) else { + debug!( + context_logger, + "Context schema does not contain block hash column, skipping" + ); + continue; + }; + + columns.push(block_hash_column); } - Ok(()) - }; - check_required_column(column_aliases::BLOCK_NUMBER, "block numbers")?; - check_required_column(column_aliases::BLOCK_HASH, "block hashes")?; - check_required_column(column_aliases::BLOCK_TIMESTAMP, "block timestamps")?; + if !has_block_timestamp_column { + let Ok((block_timestamp_column, _)) = auto_block_timestamp_decoder(&record_batch) + else { + debug!( + context_logger, + "Context schema does not contain block timestamp column, skipping" + ); + continue; + }; + + columns.push(block_timestamp_column); + } + + debug!(context_logger, "Creating context query"); + context_query = Some(ContextQuery::new( + query, + block_number_column, + dataset, + table, + columns, + )); + break; + } + + if let Some(context_query) = context_query { + return Ok(BlockRangeQueryBuilder::new_with_context(context_query)); + } - Ok(schema) + Err(Error::InvalidQuery(anyhow!( + "query is required to output block numbers, block hashes and block timestamps" + ))) } } diff --git a/graph/src/amp/sql/mod.rs b/graph/src/amp/sql/mod.rs index a8f43f9078c..02355895afa 100644 --- a/graph/src/amp/sql/mod.rs +++ b/graph/src/amp/sql/mod.rs @@ -1,3 +1,3 @@ -pub mod query; +pub mod query_builder; -pub use self::query::Query; +pub use self::query_builder::{BlockRangeQueryBuilder, ContextQuery, ValidQuery}; diff --git a/graph/src/amp/sql/query/filter_blocks.rs b/graph/src/amp/sql/query/filter_blocks.rs deleted file mode 100644 index ad2fbae859a..00000000000 --- a/graph/src/amp/sql/query/filter_blocks.rs +++ /dev/null @@ -1,183 +0,0 @@ -use std::{ - collections::BTreeMap, - ops::{ControlFlow, RangeInclusive}, -}; - -use alloy::primitives::BlockNumber; -use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; - -use super::parse; -use crate::{amp::common::Ident, cheap_clone::CheapClone}; - -/// Applies a block range filter to the SQL query. -/// -/// Creates temporary ordered result sets for each table in the dataset, limiting -/// the blocks processed during execution. -/// -/// The temporary result sets replace the tables referenced in the SQL query. -/// -/// This ensures deterministic output during query execution and enables resuming -/// after failures or when new blocks are available. 
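// As a rough sketch of the rewrite this helper performed (`eth` and `logs` are
// hypothetical names, and 100..=200 a hypothetical block range), a query such as
//
//     SELECT * FROM eth.logs
//
// was turned into approximately
//
//     WITH sg_eth_logs AS (
//         SELECT * FROM eth.logs WHERE _block_num BETWEEN 100 AND 200 ORDER BY _block_num ASC
//     )
//     SELECT * FROM sg_eth_logs AS logs
//
// with `add_cte_filters` producing the CTE and `TableToCteReplacer` swapping the
// table reference; the new `BlockRangeQueryBuilder` now takes over this role.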
-pub(super) fn filter_blocks( - query: &mut ast::Query, - dataset: &Ident, - tables: &[Ident], - block_range: &RangeInclusive, -) { - let tables_to_cte_mapping = tables_to_cte_mapping(dataset, tables); - - let mut table_to_cte_replacer = TableToCteReplacer::new(dataset, &tables_to_cte_mapping); - let _: ControlFlow<()> = VisitMut::visit(query, &mut table_to_cte_replacer); - - match &mut query.with { - Some(with) => { - remove_cte_filters(&mut with.cte_tables, &tables_to_cte_mapping); - - add_cte_filters( - &mut with.cte_tables, - dataset, - &tables_to_cte_mapping, - block_range, - ); - } - None => { - let mut cte_tables = Vec::new(); - - add_cte_filters( - &mut cte_tables, - dataset, - &tables_to_cte_mapping, - block_range, - ); - - query.with = Some(ast::With { - with_token: ast::helpers::attached_token::AttachedToken::empty(), - recursive: false, - cte_tables, - }) - } - } -} - -// Maps `dataset` and `tables` to consistent names for temporary result sets. -fn tables_to_cte_mapping(dataset: &Ident, tables: &[Ident]) -> BTreeMap { - tables - .into_iter() - .map(|table| (table.cheap_clone(), format!("sg_{dataset}_{table}"))) - .collect() -} - -/// Removes previously added temporary result sets from the SQL query. -fn remove_cte_filters(ctes: &mut Vec, tables_to_cte_mapping: &BTreeMap) { - ctes.retain(|cte| { - !tables_to_cte_mapping - .values() - .any(|cte_table| *cte_table == cte.alias.name.value) - }); -} - -/// Creates temporary result sets for each table in the dataset and adds them to the SQL query. -fn add_cte_filters( - ctes: &mut Vec, - dataset: &Ident, - tables_to_cte_mapping: &BTreeMap, - block_range: &RangeInclusive, -) { - let mut output_ctes = Vec::with_capacity(ctes.len() + tables_to_cte_mapping.len()); - - for (table, cte_table) in tables_to_cte_mapping { - let query = parse::query(format!( - "SELECT * FROM {dataset}.{table} WHERE _block_num BETWEEN {} AND {} ORDER BY _block_num ASC", - block_range.start(), - block_range.end() - )) - .unwrap(); - - output_ctes.push(ast::Cte { - alias: ast::TableAlias { - name: ast::Ident::new(cte_table), - columns: Vec::new(), - }, - query: Box::new(query), - from: None, - materialized: None, - closing_paren_token: ast::helpers::attached_token::AttachedToken::empty(), - }); - } - - output_ctes.append(ctes); - let _empty = std::mem::replace(ctes, output_ctes); -} - -/// Walks the SQL AST and replaces each table reference with a temporary result set name. -struct TableToCteReplacer<'a> { - dataset: &'a Ident, - tables_to_cte_mapping: &'a BTreeMap, -} - -impl<'a> TableToCteReplacer<'a> { - /// Creates a new replacer. - fn new(dataset: &'a Ident, tables_to_cte_mapping: &'a BTreeMap) -> Self { - Self { - dataset, - tables_to_cte_mapping, - } - } - - /// Makes the `table_factor` reference a temporary result set instead of a table. - /// - /// Ignores unrelated table factors and table references without a namespace because - /// they might reference other CTEs. - fn visit_table_factor(&self, table_factor: &mut ast::TableFactor) { - let ast::TableFactor::Table { name, alias, .. 
} = table_factor else { - return; - }; - - let mut iter = name.0.iter().rev().map(|part| match part { - ast::ObjectNamePart::Identifier(ident) => ident.value.as_str(), - }); - - let Some(table) = iter.next() else { - return; - }; - - let Some(dataset) = iter.next() else { - return; - }; - - let (Ok(dataset), Ok(table)) = (Ident::new(dataset), Ident::new(table)) else { - return; - }; - - if *self.dataset != dataset { - return; - } - - let Some(cte_table) = self.tables_to_cte_mapping.get(&table) else { - return; - }; - - if alias.is_none() { - *alias = Some(ast::TableAlias { - name: ast::Ident::new(table.as_str()), - columns: Vec::new(), - }) - } - - *name = ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( - cte_table, - ))]); - } -} - -impl<'a> VisitorMut for TableToCteReplacer<'a> { - type Break = (); - - fn pre_visit_table_factor( - &mut self, - table_factor: &mut ast::TableFactor, - ) -> ControlFlow { - self.visit_table_factor(table_factor); - ControlFlow::Continue(()) - } -} diff --git a/graph/src/amp/sql/query/mod.rs b/graph/src/amp/sql/query/mod.rs deleted file mode 100644 index 8918da2009f..00000000000 --- a/graph/src/amp/sql/query/mod.rs +++ /dev/null @@ -1,153 +0,0 @@ -mod filter_blocks; -mod resolve_event_signatures; -mod resolve_source_address; -mod validate_tables; - -use std::{fmt, ops::RangeInclusive, sync::Arc}; - -use alloy::{ - json_abi::JsonAbi, - primitives::{Address, BlockNumber}, -}; -use anyhow::{bail, Context, Result}; -use itertools::Itertools; -use sqlparser_latest::ast; - -use crate::{amp::common::Ident, cheap_clone::CheapClone}; - -/// Represents a valid SQL query of a Amp subgraph. -/// -/// Parses, validates and resolves a SQL query and prepares it for execution on a Amp server. -/// The data returned by executing this query is used to create subgraph entities. -#[derive(Debug, Clone)] -pub struct Query { - /// The raw SQL AST that represents the SQL query. - ast: ast::Query, - - /// The dataset that the SQL query requests data from. - dataset: Ident, - - /// The tables that the SQL query requests data from. - tables: Arc<[Ident]>, -} - -impl Query { - /// Parses, validates and resolves a SQL query and prepares it for execution on a Amp server. - /// - /// # Errors - /// - /// Returns an error if: - /// - The SQL query cannot be parsed - /// - The SQL query is not valid - /// - The SQL query cannot be resolved - /// - /// The returned error is deterministic. - pub fn new<'a>( - sql: impl AsRef, - dataset: &Ident, - tables: &[Ident], - source_address: &Address, - abis: impl IntoIterator, - ) -> Result { - let mut query = parse::query(sql).context("failed to parse SQL query")?; - let abis = abis.into_iter().collect_vec(); - - Self::validate(&query, dataset, tables).context("failed to validate SQL query")?; - Self::resolve(&mut query, source_address, &abis).context("failed to resolve SQL query")?; - - Ok(Self { - ast: query, - dataset: dataset.cheap_clone(), - tables: tables.into(), - }) - } - - /// Applies a block range filter to this SQL query and returns the updated query. - /// - /// Creates temporary ordered result sets for each table in the dataset, limiting - /// the blocks processed during execution. - /// - /// The temporary result sets replace the tables referenced in this SQL query. - /// - /// This ensures deterministic output during query execution and enables resuming - /// after failures or when new blocks are available. 
- pub fn with_block_range_filter(&self, block_range: &RangeInclusive) -> Self { - let mut query = self.clone(); - filter_blocks::filter_blocks(&mut query.ast, &query.dataset, &query.tables, &block_range); - query - } - - /// Validates the SQL query. - /// - /// # Errors - /// - /// Returns an error if: - /// - The SQL query references unknown tables and datasets - /// - The SQL query uses custom `SETTINGS` - /// - /// The returned error is deterministic. - fn validate(query: &ast::Query, dataset: &Ident, tables: &[Ident]) -> Result<()> { - validate_tables::validate_tables(query, dataset, tables)?; - - if query.settings.is_some() { - bail!("custom SETTINGS are not allowed"); - } - - Ok(()) - } - - /// Resolves subgraph-specific function calls in the SQL query. - /// - /// # Errors - /// - /// Returns an error if: - /// - Source address function calls cannot be resolved - /// - Event signature function calls cannot be resolved - /// - /// The returned error is deterministic. - fn resolve( - query: &mut ast::Query, - source_address: &Address, - abis: &[(&Ident, &JsonAbi)], - ) -> Result<()> { - resolve_source_address::resolve_source_address(query, source_address)?; - resolve_event_signatures::resolve_event_signatures(query, abis)?; - - Ok(()) - } -} - -impl fmt::Display for Query { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.ast) - } -} - -mod parse { - use anyhow::{anyhow, bail, Context, Result}; - use itertools::Itertools; - use sqlparser_latest::{ast, dialect::GenericDialect, parser::Parser}; - - /// Parses a SQL query and returns its AST. - /// - /// # Errors - /// - /// Returns an error if: - /// - The SQL query cannot be parsed - /// - The SQL query has multiple SQL statements - /// - The SQL query is not a `SELECT` query - pub(super) fn query(s: impl AsRef) -> Result { - let statement = Parser::parse_sql(&GenericDialect {}, s.as_ref()) - .context("invalid SQL query")? - .into_iter() - .exactly_one() - .map_err(|e| anyhow!("expected exactly one SQL statement, found {}", e.count()))?; - - let query = match statement { - ast::Statement::Query(query) => *query, - _ => bail!("invalid SQL query: only SELECT statements are allowed"), - }; - - Ok(query) - } -} diff --git a/graph/src/amp/sql/query/resolve_event_signatures.rs b/graph/src/amp/sql/query/resolve_event_signatures.rs deleted file mode 100644 index c146e0e4050..00000000000 --- a/graph/src/amp/sql/query/resolve_event_signatures.rs +++ /dev/null @@ -1,110 +0,0 @@ -use std::ops::ControlFlow; - -use alloy::json_abi::JsonAbi; -use anyhow::{bail, Context, Result}; -use sqlparser_latest::ast::{self, visit_expressions_mut}; - -use crate::amp::common::Ident; - -static FUNCTION_NAME: &str = "sg_event_signature"; - -/// Replaces `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` function calls with -/// the correct event signature based on `abis`. -/// -/// # Errors -/// -/// Returns an error if: -/// - The function is called with incorrect arguments -/// - The contract name is not found in `abis` -/// - The event name is not found in `abis` -/// -/// The returned error is deterministic. 
-pub(super) fn resolve_event_signatures( - query: &mut ast::Query, - abis: &[(&Ident, &JsonAbi)], -) -> Result<()> { - let visit_result = visit_expressions_mut(query, |expr| match visit_expr(expr, abis) { - Ok(()) => ControlFlow::Continue(()), - Err(e) => ControlFlow::Break(e), - }); - - if let ControlFlow::Break(e) = visit_result { - return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); - } - - Ok(()) -} - -fn visit_expr(expr: &mut ast::Expr, abis: &[(&Ident, &JsonAbi)]) -> Result<()> { - let ast::Expr::Function(function) = expr else { - return Ok(()); - }; - - let mut ident_iter = function.name.0.iter().rev(); - let Some(ast::ObjectNamePart::Identifier(ident)) = ident_iter.next() else { - return Ok(()); - }; - - if !FUNCTION_NAME.eq_ignore_ascii_case(&ident.value) { - return Ok(()); - } - - if ident_iter.next().is_some() { - return Ok(()); - } - - let Some((contract_name, event_name)) = get_args(function) else { - bail!("invalid function call: expected `{FUNCTION_NAME}('CONTRACT_NAME', 'EVENT_NAME')`, found: `{function}`"); - }; - - let Some(event) = get_event(abis, contract_name, event_name) else { - bail!("invalid function call: unknown contract '{contract_name}' or event '{event_name}'"); - }; - - let signature = ast::Value::SingleQuotedString(event.full_signature()).with_empty_span(); - *expr = ast::Expr::Value(signature); - - Ok(()) -} - -fn get_args<'a>(function: &'a ast::Function) -> Option<(&'a str, &'a str)> { - let ast::FunctionArguments::List(args) = &function.args else { - return None; - }; - - if args.args.len() != 2 { - return None; - } - - match (get_arg(&args.args[0]), get_arg(&args.args[1])) { - (Some(contract_name), Some(event_name)) => Some((contract_name, event_name)), - _ => None, - } -} - -fn get_arg<'a>(arg: &'a ast::FunctionArg) -> Option<&'a str> { - let ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr)) = arg else { - return None; - }; - - match expr { - ast::Expr::Value(ast::ValueWithSpan { - value: ast::Value::SingleQuotedString(value), - .. - }) if !value.is_empty() => Some(value), - _ => None, - } -} - -fn get_event<'a>( - abis: &'a [(&Ident, &JsonAbi)], - contract_name: &str, - event_name: &str, -) -> Option<&'a alloy::json_abi::Event> { - abis.iter() - .find(|(name, _)| name.as_str() == contract_name) - .map(|(_, contract)| contract.event(event_name)) - .flatten() - .map(|events| events.first()) - .flatten() -} diff --git a/graph/src/amp/sql/query/resolve_source_address.rs b/graph/src/amp/sql/query/resolve_source_address.rs deleted file mode 100644 index 8c0d7faaccf..00000000000 --- a/graph/src/amp/sql/query/resolve_source_address.rs +++ /dev/null @@ -1,81 +0,0 @@ -use std::ops::ControlFlow; - -use alloy::primitives::Address; -use anyhow::{bail, Context, Result}; -use sqlparser_latest::ast::{self, visit_expressions_mut}; - -static FUNCTION_NAME: &str = "sg_source_address"; - -/// Replaces `sg_source_address()` function calls in the SQL query with the `source_address`. -/// -/// # Errors -/// -/// Returns an error if the function is called with any arguments. -/// -/// The returned error is deterministic. 
-pub(super) fn resolve_source_address( - query: &mut ast::Query, - source_address: &Address, -) -> Result<()> { - let visit_result = - visit_expressions_mut(query, |expr| match visit_expr(expr, source_address) { - Ok(()) => ControlFlow::Continue(()), - Err(e) => ControlFlow::Break(e), - }); - - if let ControlFlow::Break(e) = visit_result { - return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); - } - - Ok(()) -} - -fn visit_expr(expr: &mut ast::Expr, source_address: &Address) -> Result<()> { - let ast::Expr::Function(function) = expr else { - return Ok(()); - }; - - let mut ident_iter = function.name.0.iter().rev(); - let Some(ast::ObjectNamePart::Identifier(ident)) = ident_iter.next() else { - return Ok(()); - }; - - if !FUNCTION_NAME.eq_ignore_ascii_case(&ident.value) { - return Ok(()); - } - - if ident_iter.next().is_some() { - return Ok(()); - } - - if !matches!(function.args, ast::FunctionArguments::None) { - bail!("invalid function call: function '{FUNCTION_NAME}' does not accept arguments"); - } - - *function = ast::Function { - name: ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( - "arrow_cast", - ))]), - uses_odbc_syntax: false, - parameters: ast::FunctionArguments::None, - args: ast::FunctionArguments::List(ast::FunctionArgumentList { - duplicate_treatment: None, - args: vec![ - ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( - ast::Value::HexStringLiteral(hex::encode(source_address)).with_empty_span(), - ))), - ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( - ast::Value::SingleQuotedString("FixedSizeBinary(20)".to_string()) - .with_empty_span(), - ))), - ], - clauses: vec![], - }), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - }; - - Ok(()) -} diff --git a/graph/src/amp/sql/query/validate_tables.rs b/graph/src/amp/sql/query/validate_tables.rs deleted file mode 100644 index f9a156ea425..00000000000 --- a/graph/src/amp/sql/query/validate_tables.rs +++ /dev/null @@ -1,91 +0,0 @@ -use std::ops::ControlFlow; - -use anyhow::{anyhow, bail, Error, Result}; -use sqlparser_latest::ast::{self, Visit, Visitor}; - -use crate::amp::common::Ident; - -/// Validates the dataset and tables used by the SQL query to ensure consistency with the explicitly declared ones. -/// -/// Checks every table reference in the SQL query and verifies that they match the `dataset` and `tables`. -/// Ignores table references not in `namespace.table` format as they may reference CTEs. -/// -/// # Errors -/// -/// Returns an error if: -/// - The SQL query references a dataset that is not equal to `dataset` -/// - The SQL query references a table that is not in the `tables` list -/// -/// The returned error is deterministic. -pub(super) fn validate_tables(query: &ast::Query, dataset: &Ident, tables: &[Ident]) -> Result<()> { - let mut table_validator = TableValidator { dataset, tables }; - if let ControlFlow::Break(e) = Visit::visit(query, &mut table_validator) { - return Err(e); - } - Ok(()) -} - -/// Walks the SQL AST and validates every table reference. -struct TableValidator<'a> { - dataset: &'a Ident, - tables: &'a [Ident], -} - -impl<'a> TableValidator<'a> { - /// Validates that the `table_factor` references the explicitly declared dataset and tables. - /// - /// Ignores unrelated table factors and table references without a namespace as they may reference CTEs. 
- /// - /// # Errors - /// - /// Returns an error if: - /// - The `table_factor` references a dataset that is not equal to `dataset` - /// - The `table_factor` references a table that is not in the `tables` list - /// - /// The returned error is deterministic. - fn visit_table_factor(&self, table_factor: &ast::TableFactor) -> Result<()> { - let ast::TableFactor::Table { name, .. } = table_factor else { - return Ok(()); - }; - - let mut ident_iter = name.0.iter().rev().map(|part| match part { - ast::ObjectNamePart::Identifier(ident) => Ident::new(ident.value.as_str()), - }); - - let Some(table) = ident_iter.next() else { - return Ok(()); - }; - - let Some(dataset) = ident_iter.next() else { - return Ok(()); - }; - - let table = table?; - let dataset = dataset?; - - if *self.dataset != dataset { - bail!("'{name}': invalid dataset '{dataset}'"); - } - - if !self.tables.iter().any(|t| *t == table) { - bail!("'{name}': invalid table '{table}'"); - } - - Ok(()) - } -} - -impl<'a> Visitor for TableValidator<'a> { - type Break = Error; - - fn post_visit_table_factor( - &mut self, - table_factor: &ast::TableFactor, - ) -> ControlFlow { - if let Err(e) = self.visit_table_factor(table_factor) { - return ControlFlow::Break(anyhow!("failed to validate table {e:#}")); - } - - ControlFlow::Continue(()) - } -} diff --git a/graph/src/amp/sql/query_builder/block_range_query.rs b/graph/src/amp/sql/query_builder/block_range_query.rs new file mode 100644 index 00000000000..e82966a5346 --- /dev/null +++ b/graph/src/amp/sql/query_builder/block_range_query.rs @@ -0,0 +1,186 @@ +use std::{ + collections::BTreeMap, + hash::{BuildHasher, Hash, Hasher}, + ops::{ControlFlow, RangeInclusive}, +}; + +use ahash::RandomState; +use alloy::primitives::BlockNumber; +use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; + +use super::{extract_tables, normalize_table, parse_query}; + +/// Limits the query execution to the specified block range. +/// +/// Wraps the `query` in a CTE, and creates CTEs for every table it references. +/// These CTEs load data from the referenced tables only on the specified block range. +/// All the table references in the original SQL query are replaced with the created CTE names. +/// +/// The output is ordered by block numbers. +pub(super) fn new_block_range_query<'a>( + query: &ast::Query, + block_number_column: &str, + block_range: &RangeInclusive, +) -> ast::Query { + // CTE names are unique within a SQL query. + // The hasher ensures that CTEs created for block range do not collide with user-defined CTEs. + // Constant seeds ensure consistent block range queries for the same input parameters. 
+    let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher();
+
+    let tables_to_ctes_mapping = new_tables_to_ctes_mapping(query, &mut hasher);
+    assert!(!tables_to_ctes_mapping.is_empty());
+
+    let mut cte_tables = Vec::with_capacity(tables_to_ctes_mapping.len());
+    for (table, cte_table) in &tables_to_ctes_mapping {
+        cte_tables.push(format!(
+            "{cte_table} AS (SELECT * FROM {table} WHERE _block_num BETWEEN {start_block} AND {end_block})",
+            start_block = block_range.start(),
+            end_block = block_range.end()
+        ))
+    }
+
+    let mut query = query.clone();
+    let mut table_replacer = TableReplacer::new(tables_to_ctes_mapping);
+    let _: ControlFlow<()> = VisitMut::visit(&mut query, &mut table_replacer);
+
+    let block_range_query = format!(
+        "WITH {cte_tables}, {source} AS ({query}) SELECT {source}.* FROM {source} ORDER BY {source}.{block_number_column}",
+        cte_tables = cte_tables.join(", "),
+        source = format!("source_{}", hasher.finish())
+    );
+
+    parse_query(block_range_query).unwrap()
+}
+
+/// Creates unique CTE names for every table referenced by the SQL query.
+fn new_tables_to_ctes_mapping(
+    query: &ast::Query,
+    hasher: &mut impl Hasher,
+) -> BTreeMap<String, String> {
+    extract_tables(query)
+        .into_iter()
+        .map(|table| {
+            table.hash(hasher);
+
+            (table, format!("block_range_{}", hasher.finish()))
+        })
+        .collect()
+}
+
+/// Visits the SQL query AST and replaces referenced table names with CTE names.
+struct TableReplacer {
+    tables_to_ctes_mapping: BTreeMap<String, String>,
+}
+
+impl TableReplacer {
+    /// Creates a new table replacer.
+    fn new(tables_to_ctes_mapping: BTreeMap<String, String>) -> Self {
+        Self {
+            tables_to_ctes_mapping,
+        }
+    }
+
+    /// Replaces the table name of the current `table_factor` with the associated CTE name.
+    fn visit_table_factor(&mut self, table_factor: &mut ast::TableFactor) {
+        let ast::TableFactor::Table { name, alias, ..
} = table_factor else { + return; + }; + + let Some(cte_table) = self.tables_to_ctes_mapping.get(&normalize_table(name)) else { + return; + }; + + // Set the alias to the original table name so that queries like `SELECT table.column FROM table` do not break + if alias.is_none() { + let last_name_part = name.0.last().unwrap(); + + *alias = Some(ast::TableAlias { + name: last_name_part.as_ident().unwrap().clone(), + columns: Vec::new(), + }) + } + + *name = ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + cte_table, + ))]); + } +} + +impl VisitorMut for TableReplacer { + type Break = (); + + fn pre_visit_table_factor( + &mut self, + table_factor: &mut ast::TableFactor, + ) -> ControlFlow { + self.visit_table_factor(table_factor); + ControlFlow::Continue(()) + } +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + #[test] + fn query_with_one_table_reference_is_wrapped_with_block_range() { + let query = parse_query("SELECT a, b, c FROM d").unwrap(); + let block_number_column = "b"; + let block_range = 0..=1_000_000; + let block_range_query = new_block_range_query(&query, block_number_column, &block_range); + + assert_eq!( + block_range_query, + parse_query( + " + WITH block_range_14621009630487609643 AS ( + SELECT * FROM d WHERE _block_num BETWEEN 0 AND 1000000 + ), + source_14621009630487609643 AS ( + SELECT a, b, c FROM block_range_14621009630487609643 AS d + ) + SELECT + source_14621009630487609643.* + FROM + source_14621009630487609643 + ORDER BY + source_14621009630487609643.b + " + ) + .unwrap(), + ) + } + + #[test] + fn query_with_multiple_table_references_is_wrapped_with_block_range() { + let query = parse_query("SELECT a, b, c FROM d JOIN e ON e.e = d.d").unwrap(); + let block_number_column = "b"; + let block_range = 0..=1_000_000; + let block_range_query = new_block_range_query(&query, block_number_column, &block_range); + + assert_eq!( + block_range_query, + parse_query( + " + WITH block_range_14621009630487609643 AS ( + SELECT * FROM d WHERE _block_num BETWEEN 0 AND 1000000 + ), + block_range_12377422807768256314 AS ( + SELECT * FROM e WHERE _block_num BETWEEN 0 AND 1000000 + ), + source_12377422807768256314 AS ( + SELECT a, b, c FROM block_range_14621009630487609643 AS d JOIN block_range_12377422807768256314 AS e ON e.e = d.d + ) + SELECT + source_12377422807768256314.* + FROM + source_12377422807768256314 + ORDER BY + source_12377422807768256314.b + " + ) + .unwrap(), + ) + } +} diff --git a/graph/src/amp/sql/query_builder/context_query.rs b/graph/src/amp/sql/query_builder/context_query.rs new file mode 100644 index 00000000000..cdff33ca4a3 --- /dev/null +++ b/graph/src/amp/sql/query_builder/context_query.rs @@ -0,0 +1,103 @@ +use ahash::RandomState; +use itertools::Itertools; +use sqlparser_latest::ast; + +use super::parse_query; + +/// Wraps the SQL query with additional context columns from a separate dataset. +/// +/// Creates two CTEs: one wrapping the input `query` and another loading context columns +/// from the specified context dataset and table. Joins both CTEs on block numbers to +/// include the context columns in the original query's output. +/// +/// This enables including columns required by Amp subgraphs in the original SQL query. +pub(super) fn new_context_query<'a>( + query: &ast::Query, + block_number_column: &str, + context_dataset: &str, + context_table: &str, + context_columns: impl IntoIterator, +) -> ast::Query { + // CTE names are unique within a SQL query. 
+ // The hasher ensures that CTEs created for context do not collide with user-defined CTEs. + // Constant seeds ensure consistent context queries for the same input parameters. + let hasher = RandomState::with_seeds(0, 0, 0, 0); + let query_hash = hasher.hash_one(query); + + let context_columns = context_columns.into_iter().collect_vec(); + assert!(!context_columns.is_empty()); + + let context_cte = format!("context_{query_hash}"); + let source_cte = format!("source_{query_hash}"); + + let context_query = format!( + " + WITH {context_cte} AS ( + SELECT DISTINCT _block_num, {input_context_columns} FROM {context_dataset}.{context_table} + ), + {source_cte} AS ( + {query} + ) + SELECT + {output_context_columns}, + {source_cte}.* + FROM + {source_cte} + INNER JOIN {context_cte} ON + {context_cte}._block_num = {source_cte}.{block_number_column} + ", + input_context_columns = context_columns.join(", "), + output_context_columns = context_columns + .iter() + .map(|context_column| format!("{context_cte}.{context_column}")) + .join(", "), + ); + + parse_query(context_query).unwrap() +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + #[test] + fn query_is_wrapped_with_context() { + let query = parse_query("SELECT a, b, c FROM d").unwrap(); + let block_number_column = "b"; + let context_dataset = "cx_a"; + let context_table = "cx_b"; + let context_columns = ["cx_c", "cx_d"]; + + let context_query = new_context_query( + &query, + block_number_column, + context_dataset, + context_table, + context_columns, + ); + + assert_eq!( + context_query, + parse_query( + " + WITH context_10500256449332496249 AS ( + SELECT DISTINCT _block_num, cx_c, cx_d FROM cx_a.cx_b + ), + source_10500256449332496249 AS ( + SELECT a, b, c FROM d + ) + SELECT + context_10500256449332496249.cx_c, + context_10500256449332496249.cx_d, + source_10500256449332496249.* + FROM + source_10500256449332496249 + INNER JOIN context_10500256449332496249 ON + context_10500256449332496249._block_num = source_10500256449332496249.b + " + ) + .unwrap() + ) + } +} diff --git a/graph/src/amp/sql/query_builder/event_signature_resolver.rs b/graph/src/amp/sql/query_builder/event_signature_resolver.rs new file mode 100644 index 00000000000..89ab8a31a51 --- /dev/null +++ b/graph/src/amp/sql/query_builder/event_signature_resolver.rs @@ -0,0 +1,183 @@ +use std::ops::ControlFlow; + +use alloy::json_abi::JsonAbi; +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +static FUNCTION_NAME: &str = "sg_event_signature"; + +/// Replaces `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` function calls with +/// the correct event signature based on `abis`. +/// +/// # Errors +/// +/// Returns an error if: +/// - The function is called with incorrect arguments +/// - The contract name is not found in `abis` +/// - The event name is not found in `abis` +/// +/// The returned error is deterministic. 
+pub(super) fn resolve_event_signatures( + query: &mut ast::Query, + abis: &[(&str, &JsonAbi)], +) -> Result<()> { + let visit_result = visit_expressions_mut(query, |expr| match visit_expr(expr, abis) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, abis: &[(&str, &JsonAbi)]) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&function.name.to_string()) { + return Ok(()); + } + + let Some((contract_name, event_name)) = get_args(function) else { + bail!("invalid function call: expected `{FUNCTION_NAME}('CONTRACT_NAME', 'EVENT_NAME')`, found: `{function}`"); + }; + + let Some(event) = get_event(abis, contract_name, event_name) else { + bail!("invalid function call: unknown contract '{contract_name}' or event '{event_name}'"); + }; + + let signature = ast::Value::SingleQuotedString(event.full_signature()).with_empty_span(); + *expr = ast::Expr::Value(signature); + + Ok(()) +} + +fn get_args<'a>(function: &'a ast::Function) -> Option<(&'a str, &'a str)> { + let ast::FunctionArguments::List(args) = &function.args else { + return None; + }; + + if args.args.len() != 2 { + return None; + } + + match (get_arg(&args.args[0]), get_arg(&args.args[1])) { + (Some(contract_name), Some(event_name)) => Some((contract_name, event_name)), + _ => None, + } +} + +fn get_arg<'a>(arg: &'a ast::FunctionArg) -> Option<&'a str> { + let ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr)) = arg else { + return None; + }; + + match expr { + ast::Expr::Value(ast::ValueWithSpan { + value: ast::Value::SingleQuotedString(value), + .. + }) if !value.is_empty() => Some(value), + _ => None, + } +} + +fn get_event<'a>( + abis: &'a [(&str, &JsonAbi)], + contract_name: &str, + event_name: &str, +) -> Option<&'a alloy::json_abi::Event> { + abis.iter() + .filter(|(name, _)| *name == contract_name) + .map(|(_, contract)| contract.event(event_name)) + .flatten() + .map(|events| events.first()) + .flatten() + .next() +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + use self::fixtures::*; + + mod fixtures { + use std::sync::LazyLock; + + use super::*; + + pub(super) static ABIS: LazyLock> = LazyLock::new(|| { + vec![ + ("ContractA", JsonAbi::parse([&*event("TransferA")]).unwrap()), + ("ContractB", JsonAbi::parse([&*event("TransferB")]).unwrap()), + ("ContractB", JsonAbi::parse([&*event("TransferC")]).unwrap()), + ] + }); + + pub(super) fn event(name: &str) -> String { + format!("event {name}(address indexed from, address indexed to, address value)") + } + } + + macro_rules! test_resolve_event_signatures { + ($($name:ident: $query:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let mut query = parse_query($query).unwrap(); + let abis = ABIS.iter().map(|abi| (abi.0, &abi.1)).collect::>(); + let result = resolve_event_signatures(&mut query, &abis); + + match $expected { + Result::<&str, ()>::Ok(expected) => { + result.unwrap(); + assert_eq!(query, parse_query(expected).unwrap()); + }, + Err(_) => { + result.unwrap_err(); + } + } + } + )* + }; + } + + test_resolve_event_signatures! 
{ + nothing_to_resolve: "SELECT a FROM b" => Ok("SELECT a FROM b"), + + call_with_no_arguments: "SELECT a FROM b WHERE c = sg_event_signature()" => Err(()), + call_with_one_argument: "SELECT a FROM b WHERE c = sg_event_signature('ContractA')" => Err(()), + call_with_first_invalid_argument: "SELECT a FROM b WHERE c = sg_event_signature(ContractA, 'TransferA')" => Err(()), + call_with_second_invalid_argument: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', TransferA)" => Err(()), + call_with_two_invalid_arguments: "SELECT a FROM b WHERE c = sg_event_signature(ContractA, TransferA)" => Err(()), + call_with_unknown_contract: "SELECT a FROM b WHERE c = sg_event_signature('ContractX', 'TransferA')" => Err(()), + call_with_unknown_event: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferX')" => Err(()), + call_with_contract_and_event_mismatch: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferB')" => Err(()), + call_with_invalid_argument_cases: "SELECT a FROM b WHERE c = sg_event_signature('contractA', 'transferA')" => Err(()), + + resolve_one_call: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}'", event("TransferA"))), + + resolve_multiple_calls: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA') OR d = sg_event_signature('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferA"), event("TransferA"))), + + resolve_multiple_calls_with_different_arguments: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA') OR d = sg_event_signature('ContractB', 'TransferB')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferA"), event("TransferB"))), + + resolve_multiple_calls_with_events_from_different_abis_with_the_same_name: + "SELECT a FROM b WHERE c = sg_event_signature('ContractB', 'TransferB') OR d = sg_event_signature('ContractB', 'TransferC')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferB"), event("TransferC"))), + + resolve_calls_with_case_insensitive_function_name: + "SELECT a FROM b WHERE c = sg_Event_SIGNATURE('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}'", event("TransferA"))), + } +} diff --git a/graph/src/amp/sql/query_builder/mod.rs b/graph/src/amp/sql/query_builder/mod.rs new file mode 100644 index 00000000000..8a16b1a831f --- /dev/null +++ b/graph/src/amp/sql/query_builder/mod.rs @@ -0,0 +1,191 @@ +mod block_range_query; +mod context_query; +mod event_signature_resolver; +mod parser; +mod source_address_resolver; +mod table_extractor; +mod table_validator; + +use std::{fmt, ops::RangeInclusive}; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use anyhow::{bail, Context, Result}; +use itertools::Itertools; +use sqlparser_latest::ast; + +use self::{ + block_range_query::new_block_range_query, + context_query::new_context_query, + event_signature_resolver::resolve_event_signatures, + parser::parse_query, + source_address_resolver::resolve_source_address, + table_extractor::{extract_tables, normalize_table}, + table_validator::validate_tables, +}; + +/// Represents a valid SQL query that can be executed on an Amp server. +#[derive(Debug, Clone)] +pub struct ValidQuery { + query: ast::Query, +} + +impl ValidQuery { + /// Parses, validates and resolves the input SQL query. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query cannot be parsed + /// - The SQL query is not valid + /// - The SQL query cannot be resolved + /// + /// The returned error is deterministic. + pub fn new<'a>( + sql: &str, + dataset: &str, + tables: impl IntoIterator, + source_address: &Address, + abis: impl IntoIterator, + ) -> Result { + let mut query = parse_query(sql).context("failed to parse SQL query")?; + + Self::validate(&query, dataset, tables).context("failed to validate SQL query")?; + Self::resolve(&mut query, source_address, abis).context("failed to resolve SQL query")?; + + Ok(Self { query }) + } + + /// Validates the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query references unknown datasets or tables + /// - The SQL query uses custom `SETTINGS` + /// + /// The returned error is deterministic. + fn validate<'a>( + query: &ast::Query, + dataset: &str, + tables: impl IntoIterator, + ) -> Result<()> { + validate_tables(query, dataset, tables)?; + + if query.settings.is_some() { + bail!("custom SETTINGS are not allowed"); + } + + Ok(()) + } + + /// Resolves subgraph-specific function calls in the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - Source address function calls cannot be resolved + /// - Event signature function calls cannot be resolved + /// + /// The returned error is deterministic. + fn resolve<'a>( + query: &mut ast::Query, + source_address: &Address, + abis: impl IntoIterator, + ) -> Result<()> { + resolve_source_address(query, source_address)?; + resolve_event_signatures(query, &abis.into_iter().collect_vec())?; + + Ok(()) + } +} + +impl fmt::Display for ValidQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.query) + } +} + +/// Represents a valid SQL query that contains columns required by Amp subgraphs. +#[derive(Debug, Clone)] +pub struct ContextQuery { + query: ast::Query, + block_number_column: String, +} + +impl ContextQuery { + /// Wraps the SQL query with additional context columns from a separate dataset. + /// + /// Creates two CTEs: one wrapping the input `query` and another loading context columns + /// from the specified context dataset and table. Joins both CTEs on block numbers to + /// include the context columns in the original query's output. + /// + /// This enables including columns required by Amp subgraphs in the original SQL query. + pub fn new<'a>( + valid_query: ValidQuery, + block_number_column: &str, + context_dataset: &str, + context_table: &str, + context_columns: impl IntoIterator, + ) -> Self { + let ValidQuery { query } = valid_query; + + let query = new_context_query( + &query, + block_number_column, + context_dataset, + context_table, + context_columns, + ); + + Self { + query, + block_number_column: block_number_column.to_string(), + } + } +} + +/// Builds valid SQL queries for execution on an Amp server with block range limits. +#[derive(Debug, Clone)] +pub struct BlockRangeQueryBuilder { + query: ast::Query, + block_number_column: String, +} + +impl BlockRangeQueryBuilder { + /// Creates a new block range query builder with the specified valid SQL query. + pub fn new(valid_query: ValidQuery, block_number_column: &str) -> Self { + let ValidQuery { query } = valid_query; + + Self { + query, + block_number_column: block_number_column.to_string(), + } + } + + /// Creates a new block range query builder with the specified context SQL query. 
+ pub fn new_with_context(context_query: ContextQuery) -> Self { + let ContextQuery { + query, + block_number_column, + } = context_query; + + Self { + query, + block_number_column, + } + } + + /// Limits the query execution to the specified block range. + /// + /// Wraps this SQL query in a CTE, and creates CTEs for every table it references. + /// These CTEs load data from the referenced tables only on the specified block range. + /// All the table references in the original SQL query are replaced with the created CTE names. + /// + /// The output is ordered by block numbers. + pub fn build_with_block_range(&self, block_range: &RangeInclusive) -> String { + new_block_range_query(&self.query, &self.block_number_column, block_range).to_string() + } +} diff --git a/graph/src/amp/sql/query_builder/parser.rs b/graph/src/amp/sql/query_builder/parser.rs new file mode 100644 index 00000000000..31ff4d068c8 --- /dev/null +++ b/graph/src/amp/sql/query_builder/parser.rs @@ -0,0 +1,159 @@ +use std::ops::ControlFlow; + +use anyhow::{anyhow, bail, Context, Result}; +use itertools::Itertools; +use sqlparser_latest::{ + ast::{self, Visit, Visitor}, + dialect::GenericDialect, + parser::Parser, +}; + +/// Parses a SQL query and returns its AST. +/// +/// # Errors +/// +/// Returns an error if: +/// - The SQL query cannot be parsed +/// - The SQL query contains multiple SQL statements +/// - The SQL query is not a `SELECT` query +/// - The SQL query contains CTEs with quoted names +/// +/// The returned error is deterministic. +pub(super) fn parse_query(s: impl AsRef) -> Result { + let statement = Parser::parse_sql(&GenericDialect {}, s.as_ref()) + .context("invalid SQL query")? + .into_iter() + .exactly_one() + .map_err(|e| anyhow!("expected exactly one SQL statement, found {}", e.count()))?; + + let query = match statement { + ast::Statement::Query(query) => *query, + _ => bail!("invalid SQL query: only SELECT statements are allowed"), + }; + + if let ControlFlow::Break(e) = query.visit(&mut AllowOnlySelectQueries) { + return Err(e); + } + + if let ControlFlow::Break(e) = query.visit(&mut AllowOnlyUnquotedCtes) { + return Err(e); + } + + Ok(query) +} + +/// Validates that the SQL query AST contains only `SELECT` queries in subqueries. +struct AllowOnlySelectQueries; + +impl AllowOnlySelectQueries { + /// Returns an error if the `set_expr` is not a `SELECT` expression. + fn visit_set_expr(&self, set_expr: &ast::SetExpr) -> Result<()> { + match set_expr { + ast::SetExpr::Select(_) + | ast::SetExpr::Query(_) + | ast::SetExpr::Values(_) + | ast::SetExpr::Table(_) => Ok(()), + ast::SetExpr::SetOperation { left, right, .. } => { + self.visit_set_expr(left)?; + self.visit_set_expr(right)?; + Ok(()) + } + ast::SetExpr::Insert(_) | ast::SetExpr::Update(_) | ast::SetExpr::Delete(_) => { + bail!("invalid SQL query: only SELECT queries are allowed") + } + } + } +} + +impl Visitor for AllowOnlySelectQueries { + type Break = anyhow::Error; + + fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { + match self.visit_set_expr(&query.body) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + } + } +} + +/// Validates that CTE names in the SQL query AST do not use quotes. +/// +/// This is a temporary solution that allows proper identification of table references. +struct AllowOnlyUnquotedCtes; + +impl AllowOnlyUnquotedCtes { + /// Returns an error if the `query` contains CTEs with quoted names. 
+ fn visit_query(&self, query: &ast::Query) -> Result<()> { + let Some(with) = &query.with else { + return Ok(()); + }; + + for cte_table in &with.cte_tables { + let cte_name = &cte_table.alias.name; + + if cte_name.quote_style.is_some() { + bail!("invalid CTE {cte_name}: CTE names with quotes are not allowed"); + } + } + + Ok(()) + } +} + +impl Visitor for AllowOnlyUnquotedCtes { + type Break = anyhow::Error; + + fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { + match self.visit_query(query) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! test_parse_query { + ($($name:ident: $input:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let result = parse_query($input); + + match $expected { + Result::<&str, &str>::Ok(expected) => { + assert_eq!(result.unwrap().to_string(), expected); + }, + Err(e) => { + assert_eq!(result.unwrap_err().to_string(), e); + } + } + } + )* + }; + } + + test_parse_query! { + invalid_query: "SELECT" => Err("invalid SQL query"), + multiple_statements: "SELECT a FROM b; SELECT c FROM d" => Err("expected exactly one SQL statement, found 2"), + insert_statement: "INSERT INTO a VALUES (b)" => Err("invalid SQL query: only SELECT statements are allowed"), + update_statement: "UPDATE a SET b = c" => Err("invalid SQL query: only SELECT statements are allowed"), + delete_statement: "DELETE FROM a WHERE b = c" => Err("invalid SQL query: only SELECT statements are allowed"), + truncate_statement: "TRUNCATE TABLE a" => Err("invalid SQL query: only SELECT statements are allowed"), + drop_statement: "DROP TABLE a" => Err("invalid SQL query: only SELECT statements are allowed"), + + nested_insert_query: "WITH a AS (INSERT INTO b VALUES (c) RETURNING d) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + nested_update_query: "WITH a AS (UPDATE b SET c = d RETURNING e) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + nested_delete_query: "WITH a AS (DELETE FROM b WHERE c = d RETURNING e) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + + valid_query: "SELECT a FROM b" => Ok("SELECT a FROM b"), + valid_query_with_cte: "WITH a AS (SELECT b FROM c) SELECT * FROM a" => Ok("WITH a AS (SELECT b FROM c) SELECT * FROM a"), + valid_query_with_join: "SELECT a FROM b INNER JOIN c ON c.c = b.b" => Ok("SELECT a FROM b INNER JOIN c ON c.c = b.b"), + + single_quoted_ctes_not_allowed: "WITH 'a' AS (SELECT * FROM b) SELECT * FROM a" => Err("invalid CTE 'a': CTE names with quotes are not allowed"), + double_quoted_ctes_not_allowed: r#"WITH "a" AS (SELECT * FROM b) SELECT * FROM a"# => Err(r#"invalid CTE "a": CTE names with quotes are not allowed"#), + backticked_ctes_not_allowed: "WITH `a` AS (SELECT * FROM b) SELECT * FROM a" => Err("invalid CTE `a`: CTE names with quotes are not allowed"), + } +} diff --git a/graph/src/amp/sql/query_builder/source_address_resolver.rs b/graph/src/amp/sql/query_builder/source_address_resolver.rs new file mode 100644 index 00000000000..579e0873bb6 --- /dev/null +++ b/graph/src/amp/sql/query_builder/source_address_resolver.rs @@ -0,0 +1,133 @@ +use std::ops::ControlFlow; + +use alloy::primitives::Address; +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +static FUNCTION_NAME: &str = "sg_source_address"; + +/// Replaces `sg_source_address()` function calls in the SQL query with the 
`source_address`. +/// +/// # Errors +/// +/// Returns an error if the function is called with any arguments. +/// +/// The returned error is deterministic. +pub(super) fn resolve_source_address( + query: &mut ast::Query, + source_address: &Address, +) -> Result<()> { + let visit_result = + visit_expressions_mut(query, |expr| match visit_expr(expr, source_address) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, source_address: &Address) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&function.name.to_string()) { + return Ok(()); + } + + match &function.args { + ast::FunctionArguments::None => {} + ast::FunctionArguments::List(args) if args.args.is_empty() => {} + _ => { + bail!("invalid function call: function '{FUNCTION_NAME}' does not accept arguments"); + } + } + + *function = ast::Function { + name: ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + "arrow_cast", + ))]), + uses_odbc_syntax: false, + parameters: ast::FunctionArguments::None, + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args: vec![ + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::HexStringLiteral(hex::encode(source_address)).with_empty_span(), + ))), + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::SingleQuotedString("FixedSizeBinary(20)".to_string()) + .with_empty_span(), + ))), + ], + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + }; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + use self::fixtures::*; + + mod fixtures { + use super::*; + + pub(super) const SOURCE_ADDRESS: Address = Address::ZERO; + + pub(super) const RESOLVED_FUNCTION_CALL: &str = + "arrow_cast(X'0000000000000000000000000000000000000000', 'FixedSizeBinary(20)')"; + } + + macro_rules! test_resolve_source_address { + ($($name:ident: $query:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let mut query = parse_query($query).unwrap(); + let result = resolve_source_address(&mut query, &SOURCE_ADDRESS); + + match $expected { + Result::<&str, ()>::Ok(expected) => { + result.unwrap(); + assert_eq!(query, parse_query(expected).unwrap()); + }, + Err(_) => { + result.unwrap_err(); + } + } + } + )* + }; + } + + test_resolve_source_address! 
{ + nothing_to_resolve: "SELECT a FROM b" => Ok("SELECT a FROM b"), + call_with_one_argument: "SELECT a FROM b WHERE c = sg_source_address(d)" => Err(()), + call_with_multiple_argument: "SELECT a FROM b WHERE c = sg_source_address(d, e)" => Err(()), + + resolve_one_call: + "SELECT a FROM b WHERE c = sg_source_address()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL}")), + + resolve_multiple_calls: + "SELECT a FROM b WHERE c = sg_source_address() OR d = sg_source_address()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL} OR d = {RESOLVED_FUNCTION_CALL}")), + + resolve_calls_with_case_insensitive_function_name: + "SELECT a FROM b WHERE c = sg_Source_ADDRESS()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL}")), + } +} diff --git a/graph/src/amp/sql/query_builder/table_extractor.rs b/graph/src/amp/sql/query_builder/table_extractor.rs new file mode 100644 index 00000000000..0161e55fd49 --- /dev/null +++ b/graph/src/amp/sql/query_builder/table_extractor.rs @@ -0,0 +1,153 @@ +use std::{collections::BTreeSet, ops::ControlFlow}; + +use itertools::Itertools; +use sqlparser_latest::ast::{self, Visit, Visitor}; + +/// Returns all tables that are referenced by the SQL query. +/// +/// The table names are lowercased and quotes are ignored. +pub(super) fn extract_tables(query: &ast::Query) -> BTreeSet { + let mut table_extractor = TableExtractor::new(); + let _: ControlFlow<()> = Visit::visit(query, &mut table_extractor); + + table_extractor.tables +} + +/// Returns the normalized table name. +/// +/// The table name is lowercased and quotes are ignored. +pub(super) fn normalize_table(object_name: &ast::ObjectName) -> String { + object_name + .0 + .iter() + .map(|part| match part { + ast::ObjectNamePart::Identifier(ident) => ident.value.to_lowercase(), + }) + .join(".") +} + +/// Visits the SQL query AST and extracts referenced table names, ignoring CTEs. +struct TableExtractor { + tables: BTreeSet, + cte_stack: CteStack, +} + +impl TableExtractor { + /// Creates a new empty table extractor. + fn new() -> Self { + Self { + tables: BTreeSet::new(), + cte_stack: CteStack::new(), + } + } + + /// Extracts and stores the table name from the current `table_factor`. + fn visit_table_factor(&mut self, table_factor: &ast::TableFactor) { + let ast::TableFactor::Table { name, .. } = table_factor else { + return; + }; + + let table = normalize_table(name); + + if self.cte_stack.contains(&table) { + return; + } + + self.tables.insert(table); + } +} + +impl Visitor for TableExtractor { + type Break = (); + + fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { + self.cte_stack.pre_visit_query(query); + ControlFlow::Continue(()) + } + + fn post_visit_query(&mut self, _query: &ast::Query) -> ControlFlow { + self.cte_stack.post_visit_query(); + ControlFlow::Continue(()) + } + + fn pre_visit_table_factor( + &mut self, + table_factor: &ast::TableFactor, + ) -> ControlFlow { + self.visit_table_factor(table_factor); + ControlFlow::Continue(()) + } +} + +/// Maintains a list of active CTEs for each subquery scope. +struct CteStack { + stack: Vec>, +} + +impl CteStack { + /// Creates a new empty CTE stack. + fn new() -> Self { + Self { stack: Vec::new() } + } + + /// Returns `true` if the `table_name` is present in the CTE list at any scope. + fn contains(&self, table_name: &str) -> bool { + self.stack.iter().any(|scope| scope.contains(table_name)) + } + + /// Creates a new subquery scope with all the CTEs of the current `query`. 
+ fn pre_visit_query(&mut self, query: &ast::Query) { + let cte_tables = match &query.with { + Some(with) => with + .cte_tables + .iter() + .map(|cte_table| cte_table.alias.name.value.to_lowercase()) + .collect(), + None => BTreeSet::new(), + }; + + self.stack.push(cte_tables); + } + + /// Removes all the CTEs from the most recent subquery scope. + fn post_visit_query(&mut self) { + self.stack.pop(); + } +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + macro_rules! test_extract_tables { + ($($name:ident: $input:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let query = parse_query($input).unwrap(); + assert_eq!(extract_tables(&query), $expected.into_iter().map(Into::into).collect()); + } + )* + }; + } + + test_extract_tables! { + one_table: "SELECT a FROM b" => ["b"], + multiple_tables_with_one_join: "SELECT a FROM b JOIN c ON c.c = b.b" => ["b", "c"], + multiple_tables_with_multiple_joins: "SELECT a FROM b JOIN c ON c.c = b.b JOIN d ON d.d = b.b" => ["b", "c", "d"], + one_table_with_one_cte: "WITH a AS (SELECT * FROM b) SELECT * FROM a" => ["b"], + one_table_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM a) SELECT * FROM c" => ["b"], + multiple_tables_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM d) SELECT * FROM a JOIN c ON c.c = a.a" => ["b", "d"], + multiple_tables_with_nested_ctes: "WITH a AS (WITH b AS (SELECT * FROM c) SELECT * FROM d JOIN b ON b.b = d.d) SELECT * FROM a" => ["c", "d"], + multiple_tables_with_union: "SELECT a FROM b UNION SELECT c FROM d" => ["b", "d"], + multiple_tables_with_union_all: "SELECT a FROM b UNION ALL SELECT c FROM d" => ["b", "d"], + + namespace_is_preserved: "SELECT a FROM b.c" => ["b.c"], + catalog_is_preserved: "SELECT a FROM b.c.d" => ["b.c.d"], + tables_are_lowercased: "SELECT a FROM B.C" => ["b.c"], + single_quotes_in_tables_are_ignored: "SELECT a FROM 'B'.'C'" => ["b.c"], + double_quotes_in_tables_are_ignored: r#"SELECT a FROM "B"."C""# => ["b.c"], + backticks_in_tables_are_ignored: "SELECT a FROM `B`.`C`" => ["b.c"], + } +} diff --git a/graph/src/amp/sql/query_builder/table_validator.rs b/graph/src/amp/sql/query_builder/table_validator.rs new file mode 100644 index 00000000000..d1cd256c9f2 --- /dev/null +++ b/graph/src/amp/sql/query_builder/table_validator.rs @@ -0,0 +1,94 @@ +use std::collections::BTreeSet; + +use anyhow::{bail, Result}; +use sqlparser_latest::ast; + +use super::extract_tables; + +/// Validates that SQL query references only allowed dataset and tables. +/// +/// # Errors +/// +/// Returns an error if: +/// - The `query` does not reference any tables +/// - The `query` references a table not in `allowed_tables` +/// - The `query` references a dataset other than `allowed_dataset` +/// +/// The returned error is deterministic. 
+pub(super) fn validate_tables<'a>( + query: &ast::Query, + allowed_dataset: &str, + allowed_tables: impl IntoIterator, +) -> Result<()> { + let used_tables = extract_tables(query); + + if used_tables.is_empty() { + bail!("query does not use any tables"); + } + + let allowed_dataset = allowed_dataset.to_lowercase(); + let allowed_tables = allowed_tables + .into_iter() + .map(|allowed_table| format!("{allowed_dataset}.{}", allowed_table.to_lowercase())) + .collect::>(); + + for used_table in used_tables { + if !allowed_tables.contains(&used_table) { + bail!("table '{used_table}' not allowed"); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + macro_rules! test_validate_tables { + ($($name:ident: $input:expr, $dataset:expr, $tables:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let query = parse_query($input).unwrap(); + let result = validate_tables(&query, $dataset, $tables); + + match $expected { + Result::<(), &str>::Ok(()) => { + result.unwrap(); + }, + Err(e) => { + assert_eq!(result.unwrap_err().to_string(), e); + } + } + } + )* + }; + } + + test_validate_tables! { + no_table_references: "SELECT *", "a", ["b"] => Err("query does not use any tables"), + missing_dataset: "SELECT * FROM b", "a", ["b"] => Err("table 'b' not allowed"), + missing_table: "SELECT * FROM a", "a", ["b"] => Err("table 'a' not allowed"), + invalid_dataset: "SELECT * FROM c.b", "a", ["b"] => Err("table 'c.b' not allowed"), + invalid_nested_dataset: "WITH a AS (SELECT * FROM c.b) SELECT * FROM a", "a", ["b"] => Err("table 'c.b' not allowed"), + invalid_table: "SELECT * FROM a.c", "a", ["b"] => Err("table 'a.c' not allowed"), + invalid_nested_table: "WITH a AS (SELECT * FROM a.c) SELECT * FROM a", "a", ["b"] => Err("table 'a.c' not allowed"), + using_catalog: "SELECT * FROM c.a.b", "a", ["b"] => Err("table 'c.a.b' not allowed"), + + one_valid_table: "SELECT * FROM a.b", "a", ["b"] => Ok(()), + one_valid_nested_table: "WITH a AS (SELECT * FROM a.b) SELECT * FROM a", "a", ["b"] => Ok(()), + multiple_valid_tables: "SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b", "a", ["b", "c"] => Ok(()), + multiple_valid_nested_tables: "WITH a AS (SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b) SELECT * FROM a", "a", ["b", "c"] => Ok(()), + + single_quotes_are_ignored: "SELECT * FROM 'a'.'b'", "a", ["b"] => Ok(()), + double_quotes_are_ignored: r#"SELECT * FROM "a"."b""#, "a", ["b"] => Ok(()), + backticks_are_ignored: "SELECT * FROM `a`.`b`", "a", ["b"] => Ok(()), + + dataset_is_case_insensitive: "SELECT * FROM A.b", "a", ["b"] => Ok(()), + tables_are_case_insensitive: "SELECT * FROM a.B", "a", ["b"] => Ok(()), + allowed_dataset_is_case_insensitive: "SELECT * FROM a.b", "A", ["b"] => Ok(()), + allowrd_tables_are_case_insensitive: "SELECT * FROM a.b", "a", ["B"] => Ok(()), + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/decoder.rs b/graph/src/amp/stream_aggregator/record_batch/decoder.rs index af94613b878..a2c5cf92daf 100644 --- a/graph/src/amp/stream_aggregator/record_batch/decoder.rs +++ b/graph/src/amp/stream_aggregator/record_batch/decoder.rs @@ -27,8 +27,8 @@ impl<'a> Decoder<'a> { /// The returned error is deterministic. 
pub(super) fn new(record_batch: &'a RecordBatch) -> Result { Ok(Self { - block_number: auto_block_number_decoder(record_batch)?, - block_hash: auto_block_hash_decoder(record_batch)?, + block_number: auto_block_number_decoder(record_batch)?.1, + block_hash: auto_block_hash_decoder(record_batch)?.1, }) } diff --git a/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs b/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs deleted file mode 100644 index 32d3317c585..00000000000 --- a/graph/src/nozzle/stream_aggregator/record_batch/group_data.rs +++ /dev/null @@ -1,88 +0,0 @@ -use std::sync::Arc; - -use anyhow::{Context, Result}; -use arrow::{ - array::{RecordBatch, UInt64Array}, - compute::{concat_batches, take_record_batch}, -}; - -/// Contains references to all record batches and rows of a group. -pub(super) struct GroupData { - parts: Vec, -} - -struct Part { - record_batch: Arc, - row_indices: Vec, -} - -impl GroupData { - /// Creates a new group with an initial `record_batch` and `row_index`. - pub(super) fn new(record_batch: Arc, row_index: usize) -> Self { - Self { - parts: vec![Part { - record_batch, - row_indices: vec![row_index as u64], - }], - } - } - - /// Adds a new `record_batch` and `row_index` to this group. - pub(super) fn add(&mut self, record_batch: Arc, row_index: usize) { - self.parts.push(Part { - record_batch, - row_indices: vec![row_index as u64], - }) - } - - /// Adds a `row_index` to the most recent record batch in this group. - /// - /// # Panics - /// - /// Panics if this group is empty. - pub(super) fn add_row_index(&mut self, row_index: usize) { - assert!(!self.parts.is_empty()); - - self.parts - .last_mut() - .unwrap() - .row_indices - .push(row_index as u64); - } - - /// Converts this group into a single record batch. - /// - /// Merges all group rows from all record batches together. - /// - /// # Errors - /// - /// Returns an error if the record batches in this group have incompatible types. - /// - /// The returned error is deterministic. - /// - /// # Panics - /// - /// Panics if: - /// - This group is empty - /// - This group contains invalid row indices - pub(super) fn into_record_batch(self) -> Result { - assert!(!self.parts.is_empty()); - - let schema = self.parts[0].record_batch.schema(); - let mut partial_record_batches = Vec::with_capacity(self.parts.len()); - - for part in self.parts { - let Part { - record_batch, - row_indices, - } = part; - - let row_indices = UInt64Array::from(row_indices); - let partial_record_batch = take_record_batch(&record_batch, &row_indices).unwrap(); - - partial_record_batches.push(partial_record_batch); - } - - concat_batches(&schema, &partial_record_batches).context("failed to merge record batches") - } -} From 4cadade559998ff8a2322409dc2b9c6cc3ec59a4 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:59:25 +0200 Subject: [PATCH 24/40] chore(graph): fix typos --- graph/src/amp/client/flight_client.rs | 2 +- graph/src/amp/manifest/data_source/mod.rs | 2 +- graph/src/amp/manifest/data_source/raw.rs | 2 +- graph/src/amp/manifest/mod.rs | 2 +- graph/src/data/subgraph/mod.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs index 9b5bc8075ed..04e5a76ab82 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -29,7 +29,7 @@ use crate::{ /// A client for the Amp Flight gRPC service. 
/// -/// This client connects to a Amp server and executes SQL queries +/// This client connects to an Amp server and executes SQL queries /// using the Apache Arrow Flight protocol. pub struct FlightClient { channel: Channel, diff --git a/graph/src/amp/manifest/data_source/mod.rs b/graph/src/amp/manifest/data_source/mod.rs index ff2a5003bb5..4ebd5af9aca 100644 --- a/graph/src/amp/manifest/data_source/mod.rs +++ b/graph/src/amp/manifest/data_source/mod.rs @@ -14,7 +14,7 @@ use crate::{ pub use self::raw::RawDataSource; -/// Represents a valid data source of a Amp subgraph. +/// Represents a valid data source of an Amp subgraph. /// /// This data source contains parsed, formatted, and resolved data. #[derive(Debug, Clone)] diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 10e065544d3..3e6efe836ad 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -31,7 +31,7 @@ use crate::{ static API_VERSIONS: LazyLock> = LazyLock::new(|| HashSet::from([Version::new(0, 0, 1)])); -/// Represents an unmodified input data source of a Amp subgraph. +/// Represents an unmodified input data source of an Amp subgraph. /// /// May contain invalid or partial data. #[derive(Debug, Clone, Deserialize)] diff --git a/graph/src/amp/manifest/mod.rs b/graph/src/amp/manifest/mod.rs index 9a16da1f194..028d567332c 100644 --- a/graph/src/amp/manifest/mod.rs +++ b/graph/src/amp/manifest/mod.rs @@ -31,7 +31,7 @@ pub struct Manifest { /// The Amp data sources of the subgraph. /// - /// A Amp subgraph can only contain Amp data sources. + /// An Amp subgraph can only contain Amp data sources. pub data_sources: Vec, } diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index f0baac7766b..c9212d7a639 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -926,7 +926,7 @@ impl UnvalidatedSubgraphManifest { .collect_vec(); if amp_data_sources.is_empty() { - // Not a Amp subgraph + // Not an Amp subgraph return Vec::new(); } From 4d74833dfc16977e1d4c7ce42c88aec29faa0713 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:37:07 +0200 Subject: [PATCH 25/40] fix(graph): use nozzle-resume header name --- graph/src/amp/client/flight_client.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs index 04e5a76ab82..6b5c14ce94f 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -129,10 +129,13 @@ impl Client for FlightClient { .map(Into::into) .collect(); - raw_client.set_header( - "amp-resume", - serialize_resume_streaming_query(resume_streaming_query), + let metadata = serialize_resume_streaming_query(resume_streaming_query); + debug!(logger, "Setting request metadata"; + "amp-resume" => &metadata ); + + // TODO: Update the header name when the Amp server updates to the latest version + raw_client.set_header("nozzle-resume", metadata); } } From 5a8688c8c9631c8ce2ddead11806022a14523b87 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 6 Nov 2025 13:59:26 +0200 Subject: [PATCH 26/40] fix(graph): extend common column aliases --- graph/src/amp/common/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/graph/src/amp/common/mod.rs b/graph/src/amp/common/mod.rs index 94d51495258..62e44170391 100644 --- a/graph/src/amp/common/mod.rs +++ 
b/graph/src/amp/common/mod.rs @@ -6,15 +6,19 @@ pub(super) mod column_aliases { pub(in crate::amp) static BLOCK_NUMBER: &[&str] = &[ "_block_num", // Meta column present in all tables "block_num", // Standard column in most raw tables + "blockNum", // Common alternative name "block", // Common alternative name "block_number", // Common alternative name + "blockNumber", // Common alternative name ]; pub(in crate::amp) static BLOCK_HASH: &[&str] = &[ "hash", // Standard column in some raw tables "block_hash", // Standard column in most raw tables and common alternative name + "blockHash", // Common alternative name ]; pub(in crate::amp) static BLOCK_TIMESTAMP: &[&str] = &[ "timestamp", // Standard column in most raw tables "block_timestamp", // Common alternative name + "blockTimestamp", // Common alternative name ]; } From 3e71ed342d9628b01f738239a927e4fbe2900b0d Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:58:17 +0200 Subject: [PATCH 27/40] fix(core, graph): use named streams in the stream aggregator --- core/src/amp_subgraph/runner/data_stream.rs | 3 +- graph/src/amp/stream_aggregator/error.rs | 14 ++-- graph/src/amp/stream_aggregator/mod.rs | 85 +++++++++++++-------- 3 files changed, 64 insertions(+), 38 deletions(-) diff --git a/core/src/amp_subgraph/runner/data_stream.rs b/core/src/amp_subgraph/runner/data_stream.rs index ec532f52adc..c96aa0b1d05 100644 --- a/core/src/amp_subgraph/runner/data_stream.rs +++ b/core/src/amp_subgraph/runner/data_stream.rs @@ -71,8 +71,9 @@ where for (j, table) in data_source.transformer.tables.iter().enumerate() { let query = table.query.build_with_block_range(block_range); + let stream_name = format!("{}.{}", data_source.name, table.name); - query_streams.push(cx.client.query(&cx.logger, query, None)); + query_streams.push((stream_name, cx.client.query(&cx.logger, query, None))); query_streams_table_ptr.push((i, j)); } } diff --git a/graph/src/amp/stream_aggregator/error.rs b/graph/src/amp/stream_aggregator/error.rs index b58f3e24799..a2ba55f71e2 100644 --- a/graph/src/amp/stream_aggregator/error.rs +++ b/graph/src/amp/stream_aggregator/error.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use thiserror::Error; use crate::amp::error::IsDeterministic; @@ -7,29 +9,29 @@ pub enum Error { #[error("failed to aggregate record batches: {0:#}")] Aggregation(#[source] anyhow::Error), - #[error("failed to buffer record batches from stream {stream_index}: {source:#}")] + #[error("failed to buffer record batches from stream '{stream_name}': {source:#}")] Buffer { - stream_index: usize, + stream_name: Arc, source: anyhow::Error, }, - #[error("failed to read record batch from stream {stream_index}: {source:#}")] + #[error("failed to read record batch from stream '{stream_name}': {source:#}")] Stream { - stream_index: usize, + stream_name: Arc, source: anyhow::Error, is_deterministic: bool, }, } impl Error { - pub(super) fn stream(stream_index: usize, e: E) -> Self + pub(super) fn stream(stream_name: Arc, e: E) -> Self where E: std::error::Error + IsDeterministic + Send + Sync + 'static, { let is_deterministic = e.is_deterministic(); Self::Stream { - stream_index, + stream_name, source: anyhow::Error::from(e), is_deterministic, } diff --git a/graph/src/amp/stream_aggregator/mod.rs b/graph/src/amp/stream_aggregator/mod.rs index 1f5c558b77c..e2f0892252f 100644 --- a/graph/src/amp/stream_aggregator/mod.rs +++ b/graph/src/amp/stream_aggregator/mod.rs @@ -3,6 +3,7 @@ mod record_batch; use std::{ pin::Pin, + sync::Arc, 
task::{self, Poll}, }; @@ -12,7 +13,10 @@ use futures03::{stream::BoxStream, Stream, StreamExt, TryStreamExt}; use slog::{debug, info, Logger}; use self::record_batch::Buffer; -use crate::amp::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}; +use crate::{ + amp::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}, + cheap_clone::CheapClone, +}; pub use self::{ error::Error, @@ -38,10 +42,16 @@ pub use self::{ /// To ensure data consistency and ordered output, the aggregator waits for slower streams /// to catch up with faster streams. The output stream speed matches the slowest input stream. pub struct StreamAggregator { - streams: Vec>>, + named_streams: Vec<(Arc, BoxStream<'static, Result>)>, buffer: Buffer, logger: Logger, + + /// Indicates whether all streams are fully consumed. is_finalized: bool, + + /// Indicates whether any stream has produced an error. + /// + /// When `true`, the stream aggregator stops polling all other streams. is_failed: bool, } @@ -49,7 +59,7 @@ impl StreamAggregator { /// Creates a new stream aggregator from the `streams` with a bounded buffer. pub fn new( logger: &Logger, - streams: impl IntoIterator>>, + named_streams: impl IntoIterator>)>, max_buffer_size: usize, ) -> Self where @@ -57,27 +67,39 @@ impl StreamAggregator { { let logger = logger.component("AmpStreamAggregator"); - let streams = streams + let named_streams = named_streams .into_iter() - .enumerate() - .map(|(stream_index, stream)| { - stream - .map_err(move |e| Error::stream(stream_index, e)) - .try_filter_map(move |response_batch| async move { - match response_batch { - ResponseBatch::Batch { data } => Ok(Some(data)), - ResponseBatch::Reorg(_) => Err(Error::Stream { - stream_index, - source: anyhow!("chain reorg"), - is_deterministic: false, - }), - } - }) - .boxed() + .map(|(stream_name, stream)| { + let stream_name: Arc = stream_name.into(); + ( + stream_name.cheap_clone(), + stream + .map_err({ + let stream_name = stream_name.cheap_clone(); + move |e| Error::stream(stream_name.cheap_clone(), e) + }) + .try_filter_map({ + let stream_name = stream_name.cheap_clone(); + move |response_batch| { + let stream_name = stream_name.cheap_clone(); + async move { + match response_batch { + ResponseBatch::Batch { data } => Ok(Some(data)), + ResponseBatch::Reorg(_) => Err(Error::Stream { + stream_name: stream_name.cheap_clone(), + source: anyhow!("chain reorg"), + is_deterministic: false, + }), + } + } + } + }) + .boxed(), + ) }) .collect::>(); - let num_streams = streams.len(); + let num_streams = named_streams.len(); info!(logger, "Initializing stream aggregator"; "num_streams" => num_streams, @@ -85,7 +107,7 @@ impl StreamAggregator { ); Self { - streams, + named_streams, buffer: Buffer::new(num_streams, max_buffer_size), logger, is_finalized: false, @@ -99,7 +121,12 @@ impl StreamAggregator { ) -> Poll>> { let mut made_progress = false; - for (stream_index, stream) in self.streams.iter_mut().enumerate() { + for (stream_index, (stream_name, stream)) in self.named_streams.iter_mut().enumerate() { + let logger = self.logger.new(slog::o!( + "stream_index" => stream_index, + "stream_name" => stream_name.cheap_clone() + )); + if self.buffer.is_finalized(stream_index) { continue; } @@ -108,7 +135,7 @@ impl StreamAggregator { self.is_failed = true; return Poll::Ready(Some(Err(Error::Buffer { - stream_index, + stream_name: stream_name.cheap_clone(), source: anyhow!("buffer is blocked"), }))); } @@ -123,7 +150,7 @@ impl StreamAggregator { self.buffer .extend(stream_index, 
record_batch) .map_err(|e| Error::Buffer { - stream_index, + stream_name: stream_name.cheap_clone(), source: e, }); @@ -131,8 +158,7 @@ impl StreamAggregator { Ok(()) => { made_progress = true; - debug!(self.logger, "Buffered record batch"; - "stream_index" => stream_index, + debug!(logger, "Buffered record batch"; "buffer_size" => self.buffer.size(stream_index), "has_capacity" => self.buffer.has_capacity(stream_index) ); @@ -145,9 +171,7 @@ impl StreamAggregator { } } Poll::Ready(Some(Ok(_empty_record_batch))) => { - debug!(self.logger, "Received an empty record batch"; - "stream_index" => stream_index - ); + debug!(logger, "Received an empty record batch"); } Poll::Ready(Some(Err(e))) => { self.is_failed = true; @@ -163,8 +187,7 @@ impl StreamAggregator { made_progress = true; - info!(self.logger, "Stream completed"; - "stream_index" => stream_index, + info!(logger, "Stream completed"; "buffer_size" => self.buffer.size(stream_index) ); } From a02db82f3843a8014ce03ea74f1f144529f004aa Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:47:12 +0200 Subject: [PATCH 28/40] fix(core, graph): simplify working with identifiers --- core/src/amp_subgraph/runner/latest_blocks.rs | 26 ++- core/src/amp_subgraph/runner/reorg_handler.rs | 5 +- graph/src/amp/codec/mod.rs | 14 +- graph/src/amp/codec/name_cache.rs | 22 +-- graph/src/amp/common/ident.rs | 174 ------------------ graph/src/amp/common/mod.rs | 4 - graph/src/amp/manifest/data_source/mod.rs | 15 +- graph/src/amp/manifest/data_source/raw.rs | 69 ++++--- graph/src/amp/schema/generator/entity.rs | 42 +++-- graph/src/amp/schema/generator/mod.rs | 36 ++-- graph/src/data/subgraph/mod.rs | 2 +- 11 files changed, 109 insertions(+), 300 deletions(-) delete mode 100644 graph/src/amp/common/ident.rs diff --git a/core/src/amp_subgraph/runner/latest_blocks.rs b/core/src/amp_subgraph/runner/latest_blocks.rs index e0fc7b5e1b4..359d89aee07 100644 --- a/core/src/amp_subgraph/runner/latest_blocks.rs +++ b/core/src/amp_subgraph/runner/latest_blocks.rs @@ -2,16 +2,12 @@ use alloy::primitives::BlockNumber; use anyhow::anyhow; use arrow::array::RecordBatch; use futures::{future::try_join_all, stream::BoxStream, StreamExt, TryFutureExt}; -use graph::{ - amp::{ - client::ResponseBatch, - codec::{utils::block_number_decoder, Decoder}, - common::Ident, - error::IsDeterministic, - manifest::DataSource, - Client, - }, - cheap_clone::CheapClone, +use graph::amp::{ + client::ResponseBatch, + codec::{utils::block_number_decoder, Decoder}, + error::IsDeterministic, + manifest::DataSource, + Client, }; use itertools::Itertools; use slog::debug; @@ -43,7 +39,7 @@ impl LatestBlocks { .map(move |(j, table)| ((i, j), &data_source.source.dataset, table)) }) .flatten() - .unique_by(|(_, dataset, table)| (dataset.cheap_clone(), table.cheap_clone())) + .unique_by(|(_, dataset, table)| (dataset.to_string(), table.to_string())) .map(|(table_ptr, dataset, table)| { latest_block(&cx, dataset, table) .map_ok(move |latest_block| (table_ptr, latest_block)) @@ -127,8 +123,8 @@ fn indexing_completed(data_source: &DataSource, latest_synced_block: &Option( cx: &Context, - dataset: &Ident, - table: &Ident, + dataset: &str, + table: &str, ) -> Result where AC: Client, @@ -148,8 +144,8 @@ where async fn latest_block_changed( cx: &Context, - dataset: &Ident, - table: &Ident, + dataset: &str, + table: &str, latest_block: BlockNumber, ) -> Result<(), Error> where diff --git a/core/src/amp_subgraph/runner/reorg_handler.rs 
b/core/src/amp_subgraph/runner/reorg_handler.rs index 5e6fb6beff7..e512d536fac 100644 --- a/core/src/amp_subgraph/runner/reorg_handler.rs +++ b/core/src/amp_subgraph/runner/reorg_handler.rs @@ -4,7 +4,6 @@ use futures::{future::try_join_all, StreamExt, TryFutureExt}; use graph::{ amp::{ client::{LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery}, - common::Ident, Client, }, blockchain::block_stream::FirehoseCursor, @@ -124,8 +123,8 @@ where async fn detect_reorg( cx: &Context, network: &str, - dataset: &Ident, - table: &Ident, + dataset: &str, + table: &str, latest_synced_block_number: BlockNumber, latest_synced_block_hash: BlockHash, ) -> Result, Error> diff --git a/graph/src/amp/codec/mod.rs b/graph/src/amp/codec/mod.rs index b2b016322f6..cea10174cc2 100644 --- a/graph/src/amp/codec/mod.rs +++ b/graph/src/amp/codec/mod.rs @@ -7,14 +7,16 @@ mod value_decoder; pub mod utils; -use std::collections::{BTreeMap, HashMap}; +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; use anyhow::{anyhow, bail, Context, Result}; use arrow::array::{Array, RecordBatch}; use self::{list_decoder::ListDecoder, mapping_decoder::MappingDecoder, name_cache::NameCache}; use crate::{ - amp::common::Ident, data::{ graphql::TypeExt, store::{Id, IdType, Value}, @@ -163,7 +165,7 @@ impl Codec { .fields() .into_iter() .zip(record_batch.columns()) - .map(|(field, array)| Ok((self.ident(field.name())?, array.as_ref()))) + .map(|(field, array)| Ok((self.ident(field.name()), array.as_ref()))) .collect::>>()?; let mut value_decoders = BTreeMap::new(); @@ -194,7 +196,7 @@ impl Codec { fn value_decoder<'a>( &mut self, field: &'a Field, - columns: &HashMap, + columns: &HashMap, &'a dyn Array>, ) -> Result + 'a>>> { // VIDs are auto-generated if field.name.eq_ignore_ascii_case("vid") { @@ -206,7 +208,7 @@ impl Codec { return Ok(None); } - let normalized_name = self.ident(&field.name)?; + let normalized_name = self.ident(&field.name); let array = match columns.get(&normalized_name) { Some(&array) => array, None => { @@ -230,7 +232,7 @@ impl Codec { Ok(Some(decoder)) } - fn ident(&mut self, name: impl AsRef) -> Result { + fn ident(&mut self, name: impl AsRef) -> Arc { self.name_cache.ident(name.as_ref()) } } diff --git a/graph/src/amp/codec/name_cache.rs b/graph/src/amp/codec/name_cache.rs index ed8afc79c80..ea4419c75a4 100644 --- a/graph/src/amp/codec/name_cache.rs +++ b/graph/src/amp/codec/name_cache.rs @@ -1,12 +1,12 @@ -use std::collections::HashMap; +use std::{collections::HashMap, sync::Arc}; -use anyhow::Result; +use inflector::Inflector; -use crate::{amp::common::Ident, cheap_clone::CheapClone}; +use crate::cheap_clone::CheapClone; -/// Caches identifiers that are used to match Arrow columns and subgraph entity fields. +/// Normalizes and caches identifiers that are used to match Arrow columns and subgraph entity fields. pub(super) struct NameCache { - cache: HashMap, Ident>, + cache: HashMap, Arc>, } impl NameCache { @@ -17,18 +17,18 @@ impl NameCache { } } - /// Returns the identifier for the given name. + /// Normalizes and returns the identifier for the given name. /// /// If the identifier exists in the cache, returns the cached version. - /// Otherwise, creates a new identifier, caches it, and returns it. - pub(super) fn ident(&mut self, name: &str) -> Result { + /// Otherwise, creates a new normalized identifier, caches it, and returns it. 
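    ///
    /// # Example
    ///
    /// Editorial sketch, not part of this patch: differently formatted
    /// spellings of one name collapse to the same cached identifier, which is
    /// what lets an Arrow column such as `block_hash` match an entity field
    /// named `blockHash` (a `NameCache::new()` constructor is assumed here).
    ///
    /// ```rust,ignore
    /// let mut cache = NameCache::new();
    /// assert_eq!(cache.ident("block_hash"), cache.ident("blockHash"));
    /// assert_eq!(&*cache.ident("BlockHash"), "blockHash");
    /// ```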
+ pub(super) fn ident(&mut self, name: &str) -> Arc { if let Some(ident) = self.cache.get(name) { - return Ok(ident.cheap_clone()); + return ident.cheap_clone(); } - let ident = Ident::new(name)?; + let ident: Arc = name.to_camel_case().into(); self.cache.insert(name.into(), ident.cheap_clone()); - Ok(ident) + ident } } diff --git a/graph/src/amp/common/ident.rs b/graph/src/amp/common/ident.rs deleted file mode 100644 index 456bf7efef8..00000000000 --- a/graph/src/amp/common/ident.rs +++ /dev/null @@ -1,174 +0,0 @@ -use std::{ - cmp::Ordering, - fmt, - hash::{Hash, Hasher}, - sync::Arc, -}; - -use anyhow::{bail, Result}; -use heck::{ToLowerCamelCase, ToSnakeCase, ToUpperCamelCase}; -use lazy_regex::regex_is_match; - -use crate::derive::CheapClone; - -/// Represents a valid identifier that can be used for SQL table names, SQL column names, -/// entity names and entity fields. -/// -/// Validates and tokenizes an identifier to allow case-insensitive and format-insensitive -/// comparison between multiple identifiers. -/// -/// Maintains the original identifier for cases when the exact format is required after comparisons. -/// -/// # Example -/// -/// ```rust -/// # use graph::amp::common::Ident; -/// -/// assert_eq!(Ident::new("block_hash").unwrap(), Ident::new("blockHash").unwrap()); -/// assert_eq!(Ident::new("block-hash").unwrap(), Ident::new("BlockHash").unwrap()); -/// ``` -#[derive(Debug, Clone, CheapClone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Ident(Arc); - -impl Ident { - /// Creates a new identifier. - /// - /// Validates and tokenizes an identifier to allow case-insensitive and format-insensitive - /// comparison between multiple identifiers. - /// - /// # Errors - /// - /// Returns an error if: - /// - The input string `s` does not start with a letter or an underscore - /// - The input string `s` does not contain only letters, numbers, hyphens, and underscores - /// - The input string `s` contains more than 100 characters - /// - /// The returned error is deterministic. - pub fn new(s: impl AsRef) -> Result { - let raw = s.as_ref(); - - if !regex_is_match!("^[a-zA-Z_][a-zA-Z0-9_-]{0,100}$", raw) { - bail!("invalid identifier '{raw}': must start with a letter or an underscore, and contain only letters, numbers, hyphens, and underscores"); - } - - Ok(Self(Arc::new(Inner::new(raw)))) - } - - /// Returns a reference to the original string used to create this identifier. - /// - /// # Example - /// - /// ```rust - /// # use graph::amp::common::Ident; - /// - /// let ident = Ident::new("BLOCK_hash").unwrap(); - /// assert_eq!(ident.as_str(), "BLOCK_hash"); - /// ``` - pub fn as_str(&self) -> &str { - &self.0.raw - } - - /// Returns the tokens of this identifier that are used for case-insensitive and format-insensitive comparison. - /// - /// A token is a sequence of lowercase characters between case format separators. - /// - /// # Example - /// - /// ```rust - /// # use graph::amp::common::Ident; - /// - /// let ident = Ident::new("blockHash").unwrap(); - /// assert_eq!(ident.tokens(), &["block".into(), "hash".into()]); - /// ``` - pub fn tokens(&self) -> &[Box] { - &self.0.tokens - } - - /// Converts this identifier to `lowerCamelCase` format. 
- /// - /// # Example - /// - /// ```rust - /// # use graph::amp::common::Ident; - /// - /// let ident = Ident::new("block_hash").unwrap(); - /// assert_eq!(ident.to_lower_camel_case(), "blockHash"); - /// ``` - pub fn to_lower_camel_case(&self) -> String { - self.0.raw.to_lower_camel_case() - } - - /// Converts this identifier to `UpperCamelCase` format. - /// - /// # Example - /// - /// ```rust - /// # use graph::amp::common::Ident; - /// - /// let ident = Ident::new("block_hash").unwrap(); - /// assert_eq!(ident.to_upper_camel_case(), "BlockHash"); - /// ``` - pub fn to_upper_camel_case(&self) -> String { - self.0.raw.to_upper_camel_case() - } -} - -impl fmt::Display for Ident { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0.raw) - } -} - -/// The internal representation of an identifier. -#[derive(Debug)] -struct Inner { - /// The original unmodified string used to create the identifier. - raw: Box, - - /// The tokens of the identifier that are used for case-insensitive - /// and format-insensitive comparison. - tokens: Box<[Box]>, -} - -impl Inner { - /// Creates a new internal representation of an identifier. - fn new(raw: &str) -> Self { - let tokens = raw - .to_snake_case() - .split('_') - .map(Into::into) - .collect::>() - .into(); - - Self { - raw: raw.into(), - tokens, - } - } -} - -impl PartialEq for Inner { - fn eq(&self, other: &Self) -> bool { - self.tokens == other.tokens - } -} - -impl Eq for Inner {} - -impl PartialOrd for Inner { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.tokens.cmp(&other.tokens)) - } -} - -impl Ord for Inner { - fn cmp(&self, other: &Self) -> Ordering { - self.tokens.cmp(&other.tokens) - } -} - -impl Hash for Inner { - fn hash(&self, state: &mut H) { - self.tokens.hash(state); - } -} diff --git a/graph/src/amp/common/mod.rs b/graph/src/amp/common/mod.rs index 62e44170391..ec2578a9d48 100644 --- a/graph/src/amp/common/mod.rs +++ b/graph/src/amp/common/mod.rs @@ -1,7 +1,3 @@ -mod ident; - -pub use self::ident::Ident; - pub(super) mod column_aliases { pub(in crate::amp) static BLOCK_NUMBER: &[&str] = &[ "_block_num", // Meta column present in all tables diff --git a/graph/src/amp/manifest/data_source/mod.rs b/graph/src/amp/manifest/data_source/mod.rs index 4ebd5af9aca..eb0575fdbf4 100644 --- a/graph/src/amp/manifest/data_source/mod.rs +++ b/graph/src/amp/manifest/data_source/mod.rs @@ -7,10 +7,7 @@ use alloy::{ use arrow::datatypes::Schema; use semver::Version; -use crate::{ - amp::{common::Ident, sql::BlockRangeQueryBuilder}, - data::subgraph::SPEC_VERSION_1_5_0, -}; +use crate::{amp::sql::BlockRangeQueryBuilder, data::subgraph::SPEC_VERSION_1_5_0}; pub use self::raw::RawDataSource; @@ -22,7 +19,7 @@ pub struct DataSource { /// The name of the data source. /// /// Used for observability to identify progress and errors produced by this data source. - pub name: Ident, + pub name: String, /// The network name of the data source. pub network: String, @@ -43,10 +40,10 @@ impl DataSource { #[derive(Debug, Clone)] pub struct Source { /// The dataset from which SQL queries in the data source can query. - pub dataset: Ident, + pub dataset: String, /// The tables from which SQL queries in the data source can query. - pub tables: Vec, + pub tables: Vec, /// The contract address with which SQL queries in the data source interact. /// @@ -89,7 +86,7 @@ pub struct Transformer { #[derive(Debug, Clone)] pub struct Abi { /// The name of the contract. 
- pub name: Ident, + pub name: String, /// The JSON ABI of the contract. pub contract: JsonAbi, @@ -101,7 +98,7 @@ pub struct Table { /// The name of the transformed table. /// /// Must reference a valid entity name from the subgraph schema. - pub name: Ident, + pub name: String, /// The SQL query that executes on the Amp server. /// diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 3e6efe836ad..d0663471024 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -7,6 +7,7 @@ use alloy::{ use anyhow::anyhow; use arrow::{array::RecordBatch, datatypes::Schema}; use futures03::future::try_join_all; +use lazy_regex::regex_is_match; use semver::Version; use serde::Deserialize; use slog::{debug, error, Logger}; @@ -19,7 +20,6 @@ use crate::{ codec::utils::{ auto_block_hash_decoder, auto_block_number_decoder, auto_block_timestamp_decoder, }, - common::Ident, error::IsDeterministic, sql::{BlockRangeQueryBuilder, ContextQuery, ValidQuery}, }, @@ -76,8 +76,8 @@ impl RawDataSource { let logger = logger.new(slog::o!("data_source" => name.clone())); debug!(logger, "Resolving data source"); - let name = Self::resolve_name(name)?; - Self::resolve_kind(kind)?; + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; + Self::validate_kind(kind)?; let source = source .resolve() @@ -96,11 +96,7 @@ impl RawDataSource { }) } - fn resolve_name(name: String) -> Result { - Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) - } - - fn resolve_kind(kind: String) -> Result<(), Error> { + fn validate_kind(kind: String) -> Result<(), Error> { if !kind.eq_ignore_ascii_case(DataSource::KIND) { return Err(Error::InvalidValue(anyhow!("invalid `kind`"))); } @@ -149,8 +145,9 @@ impl RawSource { end_block, } = self; - let dataset = Self::resolve_dataset(dataset)?; - let tables = Self::resolve_tables(tables)?; + validate_ident(&dataset).map_err(|e| e.source_context("invalid `dataset`"))?; + Self::validate_tables(&tables)?; + let address = address.unwrap_or(Address::ZERO); let start_block = start_block.unwrap_or(BlockNumber::MIN); let end_block = end_block.unwrap_or(BlockNumber::MAX); @@ -170,11 +167,7 @@ impl RawSource { }) } - fn resolve_dataset(dataset: String) -> Result { - Ident::new(dataset).map_err(|e| Error::InvalidValue(e.context("invalid `dataset`"))) - } - - fn resolve_tables(tables: Vec) -> Result, Error> { + fn validate_tables(tables: &[String]) -> Result<(), Error> { const MAX_TABLES: usize = 100; if tables.is_empty() { @@ -187,15 +180,12 @@ impl RawSource { ))); } - tables - .into_iter() - .enumerate() - .map(|(i, table)| { - Ident::new(table).map_err(|e| { - Error::InvalidValue(e.context(format!("invalid `tables` at index {i}"))) - }) - }) - .collect() + for (i, table) in tables.iter().enumerate() { + validate_ident(table) + .map_err(|e| e.source_context(format!("invalid `table` at index {i}")))?; + } + + Ok(()) } } @@ -234,7 +224,8 @@ impl RawTransformer { abis, tables, } = self; - let api_version = Self::resolve_api_version(api_version)?; + Self::validate_api_version(&api_version)?; + let abis = Self::resolve_abis(logger, link_resolver, abis).await?; let tables = Self::resolve_tables(logger, link_resolver, amp_client, tables, source, &abis).await?; @@ -246,12 +237,12 @@ impl RawTransformer { }) } - fn resolve_api_version(api_version: Version) -> Result { - if !API_VERSIONS.contains(&api_version) { + fn validate_api_version(api_version: &Version) -> Result<(), 
Error> { + if !API_VERSIONS.contains(api_version) { return Err(Error::InvalidValue(anyhow!("invalid `api_version`"))); } - Ok(api_version) + Ok(()) } async fn resolve_abis( @@ -342,16 +333,13 @@ impl RawAbi { link_resolver: &dyn LinkResolver, ) -> Result { let Self { name, file } = self; - let name = Self::resolve_name(name)?; + + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; let contract = Self::resolve_contract(logger, link_resolver, file).await?; Ok(Abi { name, contract }) } - fn resolve_name(name: String) -> Result { - Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) - } - async fn resolve_contract( logger: &Logger, link_resolver: &dyn LinkResolver, @@ -412,7 +400,7 @@ impl RawTable { ) -> Result { let Self { name, query, file } = self; - let name = Self::resolve_name(name)?; + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; let query = match Self::resolve_query(query, source, abis)? { Some(query) => query, None => Self::resolve_file(logger, link_resolver, file, source, abis).await?, @@ -436,10 +424,6 @@ impl RawTable { }) } - fn resolve_name(name: String) -> Result { - Ident::new(name).map_err(|e| Error::InvalidValue(e.context("invalid `name`"))) - } - fn resolve_query( query: Option, source: &Source, @@ -666,3 +650,12 @@ impl IsDeterministic for Error { } } } + +fn validate_ident(s: &str) -> Result<(), Error> { + if !regex_is_match!("^[a-zA-Z_][a-zA-Z0-9_-]{0,100}$", s) { + return Err(Error::InvalidValue( + anyhow!("invalid identifier '{s}': must start with a letter or an underscore, and contain only letters, numbers, hyphens, and underscores") + )); + } + Ok(()) +} diff --git a/graph/src/amp/schema/generator/entity.rs b/graph/src/amp/schema/generator/entity.rs index 0770f1c6214..7e3fa5b8f6c 100644 --- a/graph/src/amp/schema/generator/entity.rs +++ b/graph/src/amp/schema/generator/entity.rs @@ -1,16 +1,17 @@ use std::fmt; use anyhow::{bail, Context, Result}; +use inflector::Inflector; -use crate::{amp::common::Ident, cheap_clone::CheapClone, data::store::ValueType}; +use crate::data::store::ValueType; /// A minimal representation of a subgraph entity. -pub(super) struct Entity { - name: Ident, - fields: Vec, +pub(super) struct SchemaEntity { + name: String, + fields: Vec, } -impl Entity { +impl SchemaEntity { /// Converts the Arrow schema to a subgraph entity. /// /// # Errors @@ -18,29 +19,32 @@ impl Entity { /// Returns an error if Arrow fields cannot be converted to subgraph entity fields. /// /// The returned error is deterministic. - pub(super) fn new(name: Ident, arrow_schema: arrow::datatypes::Schema) -> Result { + pub(super) fn new(name: String, arrow_schema: arrow::datatypes::Schema) -> Result { let mut fields = arrow_schema .fields() .iter() .map(|field| { - Field::new(field) + SchemaField::new(field) .with_context(|| format!("failed to create field '{}'", field.name())) }) .collect::, _>>()?; - if !fields.iter().any(|field| field.name.as_str() == "id") { - fields.push(Field::id()); + if !fields + .iter() + .any(|field| field.name.as_str().eq_ignore_ascii_case("id")) + { + fields.push(SchemaField::id()); } - fields.sort_unstable_by_key(|field| field.name.cheap_clone()); + fields.sort_unstable_by_key(|field| field.name.clone()); Ok(Self { name, fields }) } } -impl fmt::Display for Entity { +impl fmt::Display for SchemaEntity { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write! {f, "type {} @entity(immutable: true)", self.name.to_upper_camel_case()}?; + write! 
{f, "type {} @entity(immutable: true)", self.name.to_pascal_case()}?; write! {f, " {{\n"}?; for field in &self.fields { write! {f, "\t{field}\n"}?; @@ -50,14 +54,14 @@ impl fmt::Display for Entity { } /// A minimal representation of a subgraph entity field. -struct Field { - name: Ident, +struct SchemaField { + name: String, value_type: ValueType, is_list: bool, is_required: bool, } -impl Field { +impl SchemaField { /// Converts the Arrow field to a subgraph entity field. /// /// # Errors @@ -68,7 +72,7 @@ impl Field { /// /// The returned error is deterministic. fn new(arrow_field: &arrow::datatypes::Field) -> Result { - let name = Ident::new(arrow_field.name())?; + let name = arrow_field.name().to_string(); let (value_type, is_list) = arrow_data_type_to_value_type(arrow_field.data_type())?; let is_required = !arrow_field.is_nullable(); @@ -83,7 +87,7 @@ impl Field { /// Creates an `ID` subgraph entity field. fn id() -> Self { Self { - name: Ident::new("id").unwrap(), + name: "id".to_string(), value_type: ValueType::Bytes, is_list: false, is_required: true, @@ -91,9 +95,9 @@ impl Field { } } -impl fmt::Display for Field { +impl fmt::Display for SchemaField { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write! {f, "{}: ", self.name.to_lower_camel_case()}?; + write! {f, "{}: ", self.name.to_camel_case()}?; if self.is_list { write! {f, "["}?; } diff --git a/graph/src/amp/schema/generator/mod.rs b/graph/src/amp/schema/generator/mod.rs index 8ffd673cf6d..117d710adbe 100644 --- a/graph/src/amp/schema/generator/mod.rs +++ b/graph/src/amp/schema/generator/mod.rs @@ -1,14 +1,10 @@ mod entity; use anyhow::{Context, Result}; -use arrow::datatypes::Schema; use itertools::Itertools; -use self::entity::Entity; -use crate::{ - amp::common::Ident, cheap_clone::CheapClone, data::subgraph::DeploymentHash, - schema::InputSchema, -}; +use self::entity::SchemaEntity; +use crate::{data::subgraph::DeploymentHash, schema::InputSchema}; /// Generates a subgraph schema from a list of Arrow schemas. /// @@ -23,12 +19,12 @@ use crate::{ /// The returned error is deterministic. 
pub fn generate_subgraph_schema( deployment_hash: &DeploymentHash, - queries: impl IntoIterator, + named_schemas: impl IntoIterator, ) -> Result { - let mut queries = merge_related_queries(queries)?; - queries.sort_unstable_by_key(|(name, _)| name.cheap_clone()); + let mut named_schemas = merge_related_schemas(named_schemas)?; + named_schemas.sort_unstable_by_key(|(name, _)| name.clone()); - let entities = create_entities(queries)?; + let entities = create_entities(named_schemas)?; let mut subgraph_schema = String::new(); for entity in entities { @@ -42,27 +38,27 @@ pub fn generate_subgraph_schema( Ok(input_schema) } -fn merge_related_queries( - queries: impl IntoIterator, -) -> Result> { - queries +fn merge_related_schemas( + named_schemas: impl IntoIterator, +) -> Result> { + named_schemas .into_iter() - .into_group_map_by(|(name, _)| name.cheap_clone()) + .into_group_map_by(|(name, _)| name.clone()) .into_iter() - .map(|(name, related_queries)| { - let related_schemas = related_queries.into_iter().map(|(_, schema)| schema); + .map(|(name, related_schemas)| { + let related_schemas = related_schemas.into_iter().map(|(_, schema)| schema); - Schema::try_merge(related_schemas).map(|schema| (name, schema)) + arrow::datatypes::Schema::try_merge(related_schemas).map(|schema| (name, schema)) }) .collect::, _>>() .context("failed to merge schemas of related SQL queries") } -fn create_entities(queries: Vec<(Ident, Schema)>) -> Result> { +fn create_entities(queries: Vec<(String, arrow::datatypes::Schema)>) -> Result> { queries .into_iter() .map(|(name, schema)| { - Entity::new(name.cheap_clone(), schema) + SchemaEntity::new(name.clone(), schema) .with_context(|| format!("failed to create entity '{}'", name)) }) .collect::, _>>() diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index c9212d7a639..6d893be55cc 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -1183,7 +1183,7 @@ impl UnresolvedSubgraphManifest { .transformer .tables .iter() - .map(|table| (table.name.cheap_clone(), table.schema.clone())) + .map(|table| (table.name.clone(), table.schema.clone())) }) .flatten(); From 74c9357ac9eb55902ed6167825f83fdeec4f13ad Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:56:47 +0200 Subject: [PATCH 29/40] fix(graph): validate query output column names --- graph/src/amp/manifest/data_source/raw.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index d0663471024..38eda81b137 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -408,6 +408,16 @@ impl RawTable { debug!(logger, "Resolving query schema"); let schema = Self::resolve_schema(logger, amp_client, &query).await?; + + for field in schema.fields() { + validate_ident(field.name()).map_err(|e| { + e.source_context(format!( + "invalid query output schema: invalid column '{}'", + field.name() + )) + })?; + } + let block_range_query_builder = Self::resolve_block_range_query_builder( logger, amp_client, From cd8f962c497e887085f195da1cbd561f8909b885 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 6 Nov 2025 16:01:42 +0200 Subject: [PATCH 30/40] fix(graph): support all versions of the Amp server --- graph/src/amp/client/flight_client.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/graph/src/amp/client/flight_client.rs 
b/graph/src/amp/client/flight_client.rs index 6b5c14ce94f..d8a4f154632 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -134,7 +134,9 @@ impl Client for FlightClient { "amp-resume" => &metadata ); - // TODO: Update the header name when the Amp server updates to the latest version + raw_client.set_header("amp-resume", metadata.clone()); + + // TODO: Remove when the Amp server updates to the latest version raw_client.set_header("nozzle-resume", metadata); } } From c3fcb3bb6a834c4d9fb3ab167458de24db904367 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 11 Nov 2025 16:46:21 +0200 Subject: [PATCH 31/40] fix(graph): extend the list of common column aliases --- graph/src/amp/common/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/graph/src/amp/common/mod.rs b/graph/src/amp/common/mod.rs index ec2578a9d48..d98fbea3b1b 100644 --- a/graph/src/amp/common/mod.rs +++ b/graph/src/amp/common/mod.rs @@ -3,18 +3,22 @@ pub(super) mod column_aliases { "_block_num", // Meta column present in all tables "block_num", // Standard column in most raw tables "blockNum", // Common alternative name + "blocknum", // Common alternative name "block", // Common alternative name "block_number", // Common alternative name "blockNumber", // Common alternative name + "blocknumber", // Common alternative name ]; pub(in crate::amp) static BLOCK_HASH: &[&str] = &[ "hash", // Standard column in some raw tables "block_hash", // Standard column in most raw tables and common alternative name "blockHash", // Common alternative name + "blockhash", // Common alternative name ]; pub(in crate::amp) static BLOCK_TIMESTAMP: &[&str] = &[ "timestamp", // Standard column in most raw tables "block_timestamp", // Common alternative name "blockTimestamp", // Common alternative name + "blocktimestamp", // Common alternative name ]; } From 2c68f4f9eeb3ad6fd924a22a9b62dd3c3654d849 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:24:15 +0200 Subject: [PATCH 32/40] test(graph): add decoder unit-tests --- Cargo.lock | 2 +- Cargo.toml | 2 +- graph/Cargo.toml | 2 +- graph/src/amp/codec/array_decoder.rs | 1665 ++++++++++++++++++++- graph/src/amp/codec/mod.rs | 273 ++++ graph/src/amp/codec/name_cache.rs | 2 +- graph/src/amp/codec/test_fixtures.rs | 364 +++++ graph/src/amp/codec/value_decoder.rs | 525 ++++++- graph/src/data/store/scalar/bigdecimal.rs | 12 + graph/src/data/store/scalar/bigint.rs | 44 +- graph/src/data/store/scalar/timestamp.rs | 6 + 11 files changed, 2883 insertions(+), 14 deletions(-) create mode 100644 graph/src/amp/codec/test_fixtures.rs diff --git a/Cargo.lock b/Cargo.lock index c2ff2af3129..6dedcee131a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2735,7 +2735,7 @@ dependencies = [ "futures 0.3.31", "graph_derive", "graphql-parser", - "heck 0.5.0", + "half", "hex", "hex-literal 1.0.0", "http 0.2.12", diff --git a/Cargo.toml b/Cargo.toml index a2261f4c2a4..bc7e59156a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -104,7 +104,7 @@ alloy = { version = "1.0.12", default-features = false, features = ["json-abi", arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } futures = "0.3.31" -heck = "0.5.0" +half = "2.7.1" lazy-regex = "3.4.1" parking_lot = "0.12.4" sqlparser-latest = { version = "0.57.0", package = "sqlparser", features = ["visitor"] } diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 0df5c43a47b..9c55beb4c28 
100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -108,7 +108,7 @@ ahash.workspace = true alloy.workspace = true arrow-flight.workspace = true arrow.workspace = true -heck.workspace = true +half.workspace = true lazy-regex.workspace = true sqlparser-latest.workspace = true tokio-util.workspace = true diff --git a/graph/src/amp/codec/array_decoder.rs b/graph/src/amp/codec/array_decoder.rs index d8bc677472a..e74a777cb12 100644 --- a/graph/src/amp/codec/array_decoder.rs +++ b/graph/src/amp/codec/array_decoder.rs @@ -221,36 +221,48 @@ impl Decoder> for ArrayDecoder<'_, UInt64Array> { impl Decoder> for ArrayDecoder<'_, Float16Array> { fn decode(&self, row_index: usize) -> Result> { - self.value(row_index, |x| Ok(f64::from(x).into())) + self.value(row_index, |value| Ok(value.to_f32().into())) } } impl Decoder> for ArrayDecoder<'_, Float32Array> { fn decode(&self, row_index: usize) -> Result> { - self.value(row_index, |x| Ok(f64::from(x).into())) + self.value(row_index, |value| Ok(value.into())) } } impl Decoder> for ArrayDecoder<'_, Float64Array> { fn decode(&self, row_index: usize) -> Result> { - self.value(row_index, |x| Ok(x.into())) + self.value(row_index, |value| Ok(value.into())) } } impl Decoder> for ArrayDecoder<'_, Decimal128Array> { fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i32` from a decimal value")); + } + self.value(row_index, decode_i32) } } impl Decoder> for ArrayDecoder<'_, Decimal128Array> { fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i64` from a decimal value")); + } + self.value(row_index, decode_i64) } } impl Decoder> for ArrayDecoder<'_, Decimal128Array> { fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `BigInt` from a decimal value")); + } + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) } } @@ -268,18 +280,42 @@ impl Decoder> for ArrayDecoder<'_, Decimal128Array> { impl Decoder> for ArrayDecoder<'_, Decimal256Array> { fn decode(&self, row_index: usize) -> Result> { - self.value(row_index, |x| decode_i32(x.as_i128())) + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i32` from a decimal value")); + } + + self.value(row_index, |value| { + let value = value + .to_i128() + .ok_or_else(|| anyhow!("cannot decode `i32` from a larger `i256` value"))?; + + decode_i32(value) + }) } } impl Decoder> for ArrayDecoder<'_, Decimal256Array> { fn decode(&self, row_index: usize) -> Result> { - self.value(row_index, |x| decode_i64(x.as_i128())) + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i64` from a decimal value")); + } + + self.value(row_index, |value| { + let value = value + .to_i128() + .ok_or_else(|| anyhow!("cannot decode `i64` from a larger `i256` value"))?; + + decode_i64(value) + }) } } impl Decoder> for ArrayDecoder<'_, Decimal256Array> { fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `BigInt` from a decimal value")); + } + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) } } @@ -301,18 +337,78 @@ impl Decoder> for ArrayDecoder<'_, StringArray> { } } +impl Decoder> for ArrayDecoder<'_, StringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringArray> 
{ + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + impl Decoder> for ArrayDecoder<'_, StringViewArray> { fn decode(&self, row_index: usize) -> Result> { self.value(row_index, |x| Ok(x.to_string())) } } +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + impl Decoder> for ArrayDecoder<'_, LargeStringArray> { fn decode(&self, row_index: usize) -> Result> { self.value(row_index, |x| Ok(x.to_string())) } } +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + impl Decoder>> for ArrayDecoder<'_, BinaryArray> { fn decode(&self, row_index: usize) -> Result>> { self.value(row_index, |x| Ok(x.into())) @@ -427,3 +523,1562 @@ where Ok(timestamp.to_utc()) } + +#[cfg(test)] +mod tests { + use arrow::datatypes::i256; + use chrono::TimeZone; + use half::f16; + + use super::super::test_fixtures::*; + use super::*; + + mod boolean_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BooleanArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("boolean").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(true)); + assert_eq!(decoder.decode(1).unwrap(), Some(false)); + assert_eq!(decoder.decode(2).unwrap(), Some(true)); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("boolean").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int8Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i8::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i8::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + 
assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i8::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i16::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i16::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i16::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i32::MAX)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i32::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int64Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn 
decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i64::MAX)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i64::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int64").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt8Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(u8::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u8::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u8::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(u16::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u16::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), 
Some(BigInt::from(u16::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u32::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt64Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u64::MAX))); + } + + #[test] + fn decode_valid_u64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10u64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20u64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u64::MAX)); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint64").unwrap()) + 
.map(|_| ()) + .unwrap_err(); + } + } + + mod float16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::from(f16::MAX.to_f32())) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod float32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(f32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod float64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float64Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(f64::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float64").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal128_decoder_without_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal128Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("decimal128").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn 
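+        // The fixture's third row holds each column's maximum value (i8::MAX .. i64::MAX,
+        // u8::MAX .. u64::MAX, f16::MAX .. f64::MAX, i128::MAX, i256::MAX), so the
+        // `fail_to_decode_*_from_larger_values` tests below check that narrowing
+        // conversions report an error instead of silently wrapping or truncating.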
fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i128::MAX))); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::from(i128::MAX)) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("decimal128").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal128_decoder_with_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal128Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("decimal128_with_scale") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn fail_to_decode_i32_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_i64_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_big_int_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(BigDecimal::new(10.into(), -10)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(BigDecimal::new(20.into(), -10)) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new(i128::MAX.into(), -10)) + ); + } + } + + mod decimal256_decoder_without_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal256Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("decimal256").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap()) + ); + } + + #[test] + fn 
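+        // Worked example for the `*_with_scale` variants (illustrative): an Arrow
+        // Decimal128(p, s) or Decimal256(p, s) column stores unscaled integers, so a
+        // stored value v represents v * 10^(-s). With s = 10 the unscaled 10 decodes
+        // to 10 * 10^-10, which is what `BigDecimal::new(10.into(), -10)` expresses,
+        // assuming `BigDecimal::new(digits, exp)` means digits * 10^exp, as the
+        // assertions in those modules imply.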
decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new( + BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap(), + 0 + )) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("decimal256").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal256_decoder_with_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal256Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("decimal256_with_scale") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn fail_to_decode_i32_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_i64_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_big_int_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(BigDecimal::new(10.into(), -10)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(BigDecimal::new(20.into(), -10)) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new( + BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap(), + -10 + )) + ); + } + } + + mod utf8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, StringArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod utf8_view_decoder { + use super::*; + + fn decoder() -> 
Box>> + where + ArrayDecoder<'static, StringViewArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("utf8_view").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8_view").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod large_utf8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, LargeStringArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("large_utf8").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("large_utf8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), 
Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod binary_view_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BinaryViewArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("binary_view").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary_view").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod fixed_size_binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, FixedSizeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_decode_b256_values_from_invalid_binary_size() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod fixed_size_binary_32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, FixedSizeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary_32").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_b256_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(B256::from([10u8; 32]))); + assert_eq!(decoder.decode(1).unwrap(), Some(B256::from([20u8; 32]))); + assert_eq!(decoder.decode(2).unwrap(), Some(B256::from([30u8; 32]))); + } + } + + mod large_binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, LargeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("large_binary").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), 
Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("large_binary").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_second_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampSecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_second").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_second").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_millisecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampMillisecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_millisecond") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_millisecond") + .unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_microsecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampMicrosecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_microsecond") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_microsecond") + .unwrap(), + ) + 
.map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_nanosecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampNanosecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_nanosecond").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_nanosecond").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } +} diff --git a/graph/src/amp/codec/mod.rs b/graph/src/amp/codec/mod.rs index cea10174cc2..b642d0377c9 100644 --- a/graph/src/amp/codec/mod.rs +++ b/graph/src/amp/codec/mod.rs @@ -5,6 +5,9 @@ mod mapping_decoder; mod name_cache; mod value_decoder; +#[cfg(test)] +mod test_fixtures; + pub mod utils; use std::{ @@ -236,3 +239,273 @@ impl Codec { self.name_cache.ident(name.as_ref()) } } + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use arrow::array::{BinaryArray, BooleanArray, Int64Array, Int8Array}; + use arrow::datatypes::{DataType, Field, Schema}; + + use crate::data::subgraph::DeploymentHash; + + use super::*; + + static SCHEMA: LazyLock = LazyLock::new(|| { + InputSchema::parse_latest( + r#" + type Id @entity { + id: Int8! + } + + type BlockNumber @entity { + id: Int8! + blockNumber: BigInt! + } + + type OptionalBlockNumber @entity { + id: Int8! + blockNumber: BigInt + } + + type Block @entity { + id: Int8! + number: Int8! + hash: Bytes! 
+ value: BigInt + } + "#, + DeploymentHash::default(), + ) + .unwrap() + }); + + #[inline] + fn new_codec() -> Codec { + Codec::new(SCHEMA.clone()) + } + + #[test] + fn fail_to_decode_unknown_entity() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::new_empty(schema.into()); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "SomeEntity") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("entity not found")) + } + + #[test] + fn do_not_fail_on_empty_record_batch() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::new_empty(schema.into()); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + + assert!(decode_output.decoded_entities.is_empty()); + } + + #[test] + fn allow_entity_ids_to_be_auto_generated() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(BooleanArray::from(vec![true, false]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + let decoded_entities = decode_output.decoded_entities; + + assert_eq!(decoded_entities.len(), 2); + + for decoded_entity in decoded_entities { + assert!(decoded_entity.key.is_none()); + assert!(decoded_entity.entity_data.is_empty()); + } + } + + #[test] + fn decode_entity_ids() { + let schema = Schema::new(vec![Field::new("id", DataType::Int8, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(Int8Array::from(vec![10, 20, 30]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + let decoded_entities = decode_output.decoded_entities; + + assert_eq!(decoded_entities.len(), 3); + + assert_eq!( + decoded_entities[0].key.as_ref().unwrap().entity_id, + Id::Int8(10), + ); + assert_eq!( + &decoded_entities[0].entity_data, + &[(Word::from("id"), Value::Int8(10))], + ); + + assert_eq!( + decoded_entities[1].key.as_ref().unwrap().entity_id, + Id::Int8(20) + ); + assert_eq!( + &decoded_entities[1].entity_data, + &[(Word::from("id"), Value::Int8(20))], + ); + + assert_eq!( + decoded_entities[2].key.as_ref().unwrap().entity_id, + Id::Int8(30) + ); + assert_eq!( + &decoded_entities[2].entity_data, + &[(Word::from("id"), Value::Int8(30))], + ); + } + + #[test] + fn fail_to_decode_entity_when_a_required_field_is_missing() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Int8, true)]); + let record_batch = + RecordBatch::try_new(schema.into(), vec![Arc::new(Int8Array::from(vec![10]))]).unwrap(); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "BlockNumber") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("failed to get column for field 'blockNumber'")); + } + + #[test] + fn decode_entity_when_an_optional_field_is_missing() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Int8, true)]); + let record_batch = + RecordBatch::try_new(schema.into(), vec![Arc::new(Int8Array::from(vec![10]))]).unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "OptionalBlockNumber").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 1); + assert!(decoded_entitites[0].entity_data.is_empty()); + } + + #[test] + 
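+    // The case-insensitive matching exercised below appears to rely on the `NameCache`
+    // change later in this patch, which normalizes identifiers with
+    // `to_camel_case().to_lowercase()`; e.g. "block_number", "BLOCK_NUMBER" and
+    // "BlockNumber" all normalize to "blocknumber" and therefore resolve to the same
+    // entity field.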
fn match_entity_field_name_with_column_name_ignoring_case() { + for column_name in [ + "block_number", + "Block_Number", + "BLOCK_NUMBER", + "blocknumber", + "blockNumber", + "BlockNumber", + "BLOCKNUMBER", + ] { + let schema = Schema::new(vec![Field::new(column_name, DataType::Int8, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(Int8Array::from(vec![10, 20, 30]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "BlockNumber").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 3); + + assert_eq!( + &decoded_entitites[0].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(10.into()))] + ); + assert_eq!( + &decoded_entitites[1].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(20.into()))] + ); + assert_eq!( + &decoded_entitites[2].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(30.into()))] + ); + } + } + + #[test] + fn fail_to_decode_entity_when_field_type_and_column_type_are_incompatible() { + let schema = Schema::new(vec![Field::new("block_number", DataType::Boolean, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(BooleanArray::from(vec![true]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "BlockNumber") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("failed to create decoder for field 'blockNumber'")) + } + + #[test] + fn decode_entities_with_multiple_fields() { + let schema = Schema::new(vec![ + Field::new("number", DataType::Int8, true), + Field::new("hash", DataType::Binary, true), + Field::new("value", DataType::Int64, true), + ]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(Int8Array::from(vec![10, 20, 30])), + Arc::new(BinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(Int64Array::from(vec![100, 200, 300])), + ], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Block").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 3); + + assert_eq!( + &decoded_entitites[0].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"aa".as_ref().into())), + (Word::from("number"), Value::Int8(10)), + (Word::from("value"), Value::BigInt(100.into())) + ] + ); + assert_eq!( + &decoded_entitites[1].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"bb".as_ref().into())), + (Word::from("number"), Value::Int8(20)), + (Word::from("value"), Value::BigInt(200.into())) + ] + ); + assert_eq!( + &decoded_entitites[2].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"cc".as_ref().into())), + (Word::from("number"), Value::Int8(30)), + (Word::from("value"), Value::BigInt(300.into())) + ] + ); + } +} diff --git a/graph/src/amp/codec/name_cache.rs b/graph/src/amp/codec/name_cache.rs index ea4419c75a4..9ad28f7a3b1 100644 --- a/graph/src/amp/codec/name_cache.rs +++ b/graph/src/amp/codec/name_cache.rs @@ -26,7 +26,7 @@ impl NameCache { return ident.cheap_clone(); } - let ident: Arc = name.to_camel_case().into(); + let ident: Arc = name.to_camel_case().to_lowercase().into(); self.cache.insert(name.into(), ident.cheap_clone()); ident diff --git a/graph/src/amp/codec/test_fixtures.rs b/graph/src/amp/codec/test_fixtures.rs new file mode 100644 index 00000000000..a55001439b2 --- /dev/null +++ b/graph/src/amp/codec/test_fixtures.rs @@ -0,0 +1,364 @@ +use std::sync::{Arc, 
LazyLock}; + +use arrow::{ + array::{ + BinaryArray, BinaryViewArray, BooleanArray, BooleanBuilder, Decimal128Builder, + Decimal256Builder, FixedSizeBinaryArray, FixedSizeListBuilder, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeListBuilder, LargeListViewBuilder, LargeStringArray, ListBuilder, ListViewBuilder, + RecordBatch, StringArray, StringViewArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, + }, + datatypes::{ + i256, DataType, Field, Schema, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + }, +}; +use chrono::{TimeZone, Utc}; +use half::f16; + +pub static RECORD_BATCH: LazyLock = LazyLock::new(|| { + let record_batches = [ + &BOOLEAN_RECORD_BATCH, + &INT_RECORD_BATCH, + &UINT_RECORD_BATCH, + &DECIMAL_RECORD_BATCH, + &FLOAT_RECORD_BATCH, + &STRING_RECORD_BATCH, + &BINARY_RECORD_BATCH, + &TIMESTAMP_RECORD_BATCH, + ]; + + let schemas = record_batches + .iter() + .map(|record_batch| (*record_batch.schema()).clone()); + + let columns = record_batches + .into_iter() + .map(|record_batch| record_batch.columns()) + .flatten() + .map(|column| column.clone()) + .collect::>(); + + RecordBatch::try_new(Schema::try_merge(schemas).unwrap().into(), columns).unwrap() +}); + +pub static BOOLEAN_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("boolean", DataType::Boolean, true), + Field::new( + "boolean_list", + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_list_view", + DataType::ListView(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_fixed_size_list", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Boolean, true)), 3), + true, + ), + Field::new( + "boolean_large_list", + DataType::LargeList(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_large_list_view", + DataType::LargeListView(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + ]); + + let builder = || { + let mut builder = BooleanBuilder::new(); + builder.append_value(true); + builder.append_value(false); + builder.append_value(true); + builder + }; + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(BooleanArray::from(vec![true, false, true])), + Arc::new({ + let mut list_builder = ListBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = ListViewBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = FixedSizeListBuilder::new(builder(), 3); + list_builder.append(true); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.append(false); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = LargeListBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = LargeListViewBuilder::new(builder()); + 
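+            // For these list builders, `append(true)` finishes the pending values as one
+            // non-null list entry and `append(false)` appends a null entry, so row 0 is
+            // `[true, false, true]` and rows 1-2 are null, which is what the
+            // list-decoding and null-handling tests rely on.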
list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + ], + ) + .unwrap() +}); + +pub static INT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("int8", DataType::Int8, true), + Field::new("int16", DataType::Int16, true), + Field::new("int32", DataType::Int32, true), + Field::new("int64", DataType::Int64, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(Int8Array::from(vec![10, 20, i8::MAX])), + Arc::new(Int16Array::from(vec![10, 20, i16::MAX])), + Arc::new(Int32Array::from(vec![10, 20, i32::MAX])), + Arc::new(Int64Array::from(vec![10, 20, i64::MAX])), + ], + ) + .unwrap() +}); + +pub static UINT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("uint8", DataType::UInt8, true), + Field::new("uint16", DataType::UInt16, true), + Field::new("uint32", DataType::UInt32, true), + Field::new("uint64", DataType::UInt64, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(UInt8Array::from(vec![10, 20, u8::MAX])), + Arc::new(UInt16Array::from(vec![10, 20, u16::MAX])), + Arc::new(UInt32Array::from(vec![10, 20, u32::MAX])), + Arc::new(UInt64Array::from(vec![10, 20, u64::MAX])), + ], + ) + .unwrap() +}); + +pub static DECIMAL_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new( + "decimal128", + DataType::Decimal128(DECIMAL128_MAX_PRECISION, 0), + true, + ), + Field::new( + "decimal128_with_scale", + DataType::Decimal128(DECIMAL128_MAX_PRECISION, 10), + true, + ), + Field::new( + "decimal256", + DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0), + true, + ), + Field::new( + "decimal256_with_scale", + DataType::Decimal256(DECIMAL256_MAX_PRECISION, 10), + true, + ), + ]); + + let decimal_128_array = |scale: i8| { + let mut builder = Decimal128Builder::new() + .with_precision_and_scale(DECIMAL128_MAX_PRECISION, scale) + .unwrap(); + + builder.append_value(10); + builder.append_value(20); + builder.append_value(i128::MAX); + builder.finish() + }; + + let decimal_256_array = |scale: i8| { + let mut builder = Decimal256Builder::new() + .with_precision_and_scale(DECIMAL256_MAX_PRECISION, scale) + .unwrap(); + + builder.append_value(10.into()); + builder.append_value(20.into()); + builder.append_value(i256::MAX); + builder.finish() + }; + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(decimal_128_array(0)), + Arc::new(decimal_128_array(10)), + Arc::new(decimal_256_array(0)), + Arc::new(decimal_256_array(10)), + ], + ) + .unwrap() +}); + +pub static FLOAT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("float16", DataType::Float16, true), + Field::new("float32", DataType::Float32, true), + Field::new("float64", DataType::Float64, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(Float16Array::from(vec![ + f16::from_f32(10.0), + f16::from_f32(20.0), + f16::MAX, + ])), + Arc::new(Float32Array::from(vec![10.0, 20.0, f32::MAX])), + Arc::new(Float64Array::from(vec![10.0, 20.0, f64::MAX])), + ], + ) + .unwrap() +}); + +pub static STRING_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("utf8", DataType::Utf8, true), + Field::new("utf8_view", DataType::Utf8View, true), + Field::new("large_utf8", DataType::LargeUtf8, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(StringArray::from(vec!["aa", "bb", "30"])), + 
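+            // The third string ("30") appears to be deliberately numeric so the
+            // string-to-BigInt and string-to-BigDecimal paths have a value that parses,
+            // while "aa"/"bb" exercise the error path; this reading is inferred from the
+            // decoder tests, not stated in the patch.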
Arc::new(StringViewArray::from(vec!["aa", "bb", "30"])), + Arc::new(LargeStringArray::from(vec!["aa", "bb", "30"])), + ], + ) + .unwrap() +}); + +pub static BINARY_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("binary", DataType::Binary, true), + Field::new("binary_view", DataType::BinaryView, true), + Field::new("fixed_size_binary", DataType::FixedSizeBinary(2), true), + Field::new("fixed_size_binary_32", DataType::FixedSizeBinary(32), true), + Field::new("large_binary", DataType::LargeBinary, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(BinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(BinaryViewArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(FixedSizeBinaryArray::from(vec![b"aa", b"bb", b"cc"])), + Arc::new(FixedSizeBinaryArray::from(vec![ + &[10; 32], &[20; 32], &[30; 32], + ])), + Arc::new(LargeBinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + ], + ) + .unwrap() +}); + +pub static TIMESTAMP_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new( + "timestamp_second", + DataType::Timestamp(TimeUnit::Second, None), + true, + ), + Field::new( + "timestamp_millisecond", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "timestamp_microsecond", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "timestamp_nanosecond", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + ]); + + let date_time_one = Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap(); + let date_time_two = Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap(); + let date_time_three = Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap(); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(TimestampSecondArray::from(vec![ + date_time_one.timestamp(), + date_time_two.timestamp(), + date_time_three.timestamp(), + ])), + Arc::new(TimestampMillisecondArray::from(vec![ + date_time_one.timestamp_millis(), + date_time_two.timestamp_millis(), + date_time_three.timestamp_millis(), + ])), + Arc::new(TimestampMicrosecondArray::from(vec![ + date_time_one.timestamp_micros(), + date_time_two.timestamp_micros(), + date_time_three.timestamp_micros(), + ])), + Arc::new(TimestampNanosecondArray::from(vec![ + date_time_one.timestamp_nanos_opt().unwrap(), + date_time_two.timestamp_nanos_opt().unwrap(), + date_time_three.timestamp_nanos_opt().unwrap(), + ])), + ], + ) + .unwrap() +}); + +#[test] +fn record_batch_is_valid() { + let _schema = BOOLEAN_RECORD_BATCH.schema(); + let _schema = INT_RECORD_BATCH.schema(); + let _schema = UINT_RECORD_BATCH.schema(); + let _schema = DECIMAL_RECORD_BATCH.schema(); + let _schema = FLOAT_RECORD_BATCH.schema(); + let _schema = STRING_RECORD_BATCH.schema(); + let _schema = BINARY_RECORD_BATCH.schema(); + let _schema = TIMESTAMP_RECORD_BATCH.schema(); + + let _schema = RECORD_BATCH.schema(); +} diff --git a/graph/src/amp/codec/value_decoder.rs b/graph/src/amp/codec/value_decoder.rs index 99fa3969d67..c6e4e7162a2 100644 --- a/graph/src/amp/codec/value_decoder.rs +++ b/graph/src/amp/codec/value_decoder.rs @@ -86,7 +86,17 @@ fn list_value_decoder<'a>( Ok(mapping_decoder(list_decoder, Value::List)) } - data_type => Err(anyhow!("'{data_type}' is not a supported list type")), + _ => { + let decoder = single_value_decoder(value_type, array)?; + + Ok(Box::new(MappingDecoder::new(decoder, |value| { + if matches!(value, Value::Null) { + return Value::Null; + } + + Value::List(vec![value]) + 
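+            // Fallback for list fields backed by a scalar column: each non-null value is
+            // wrapped in a one-element list, while nulls stay `Value::Null` rather than
+            // becoming `[Null]` (see `decode_single_values_as_lists` in the tests below).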
}))) + } } } @@ -119,12 +129,20 @@ fn single_value_decoder<'a>( let integer_decoder = integer_decoder::>(array)?; mapping_decoder(integer_decoder, Value::BigInt) } + (ValueType::BigInt, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::BigInt) + } (ValueType::BigInt, _) => return incompatible_types_err(), (ValueType::BigDecimal, data_type) if is_decimal(data_type) => { let decimal_decoder = decimal_decoder::>(array)?; mapping_decoder(decimal_decoder, Value::BigDecimal) } + (ValueType::BigDecimal, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::BigDecimal) + } (ValueType::BigDecimal, _) => return incompatible_types_err(), (ValueType::Bytes, data_type) if is_binary(data_type) => { @@ -348,3 +366,508 @@ where Ok(array_decoder) } + +#[cfg(test)] +mod tests { + use super::super::test_fixtures::*; + use super::*; + + mod boolean_value_decoder { + use super::*; + + fn decoder(column_name: &str, is_list: bool) -> Box> { + value_decoder( + ValueType::Boolean, + is_list, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_single_values() { + let decoder = decoder("boolean", false); + + assert_eq!(decoder.decode(0).unwrap(), Value::Bool(true)); + assert_eq!(decoder.decode(1).unwrap(), Value::Bool(false)); + assert_eq!(decoder.decode(2).unwrap(), Value::Bool(true)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + + #[test] + fn decode_single_values_as_lists() { + let decoder = decoder("boolean", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![Value::Bool(true)]) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::List(vec![Value::Bool(false)]) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::List(vec![Value::Bool(true)]) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + + #[test] + fn decode_list_values() { + let decoder = decoder("boolean_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_list_view_values() { + let decoder = decoder("boolean_list_view", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_fixed_size_list_values() { + let decoder = decoder("boolean_fixed_size_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + } + + #[test] + fn decode_large_list_values() { + let decoder = decoder("boolean_large_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_large_list_view_values() { + let decoder = decoder("boolean_large_list_view", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Boolean, false, BINARY_RECORD_BATCH.column(0)) + .map(|_| ()) + 
.unwrap_err(); + } + } + + mod int_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Int, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::Int(10)); + assert_eq!(decoder.decode(1).unwrap(), Value::Int(20)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Int, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int8_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Int8, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::Int8(10)); + assert_eq!(decoder.decode(1).unwrap(), Value::Int8(20)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Int8, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod big_int_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::BigInt, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::BigInt(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Value::BigInt(BigInt::from(20))); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(2).unwrap(), Value::BigInt(BigInt::from(30))); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_from_non_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + decoder.decode(0).unwrap_err(); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::BigInt, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod big_decimal_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::BigDecimal, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in ["float16", "float32", "float64", "decimal128", "decimal256"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::BigDecimal(BigDecimal::from(10.0)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::BigDecimal(BigDecimal::from(20.0)) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numerical_strings() { + for column 
in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(2).unwrap(), + Value::BigDecimal(BigDecimal::from(30.0)) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_from_non_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + decoder.decode(0).unwrap_err(); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::BigDecimal, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod bytes_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Bytes, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in ["binary", "binary_view", "fixed_size_binary", "large_binary"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::Bytes(b"aa".as_slice().into()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::Bytes(b"bb".as_slice().into()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::Bytes(b"cc".as_slice().into()) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Bytes, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod string_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::String, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values_from_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::String("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Value::String("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Value::String("30".to_string())); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numbers() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::String("10".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Value::String("20".to_string())); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_bytes() { + for column in ["binary", "binary_view", "fixed_size_binary", "large_binary"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::String(format!("0x{}", hex::encode(b"aa"))) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::String(format!("0x{}", hex::encode(b"bb"))) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::String(format!("0x{}", hex::encode(b"cc"))) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::String, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_value_decoder { + use chrono::{TimeZone, Utc}; + + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Timestamp, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ 
+ "timestamp_second", + "timestamp_millisecond", + "timestamp_microsecond", + "timestamp_nanosecond", + ] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::Timestamp(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap().into()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::Timestamp( + Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10) + .unwrap() + .into() + ) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::Timestamp( + Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59) + .unwrap() + .into() + ) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Timestamp, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } +} diff --git a/graph/src/data/store/scalar/bigdecimal.rs b/graph/src/data/store/scalar/bigdecimal.rs index b8b62f573fb..65738563a67 100644 --- a/graph/src/data/store/scalar/bigdecimal.rs +++ b/graph/src/data/store/scalar/bigdecimal.rs @@ -138,12 +138,24 @@ impl From for BigDecimal { } } +impl From for BigDecimal { + fn from(n: i128) -> Self { + Self::from(OldBigDecimal::new(BigInt::from(n).inner(), 0)) + } +} + impl From for BigDecimal { fn from(n: u64) -> Self { Self::from(OldBigDecimal::from(n)) } } +impl From for BigDecimal { + fn from(n: f32) -> Self { + Self::from(OldBigDecimal::from_f32(n).unwrap_or_default()) + } +} + impl From for BigDecimal { fn from(n: f64) -> Self { Self::from(OldBigDecimal::from_f64(n).unwrap_or_default()) diff --git a/graph/src/data/store/scalar/bigint.rs b/graph/src/data/store/scalar/bigint.rs index c344ec83a6d..554aac83d6b 100644 --- a/graph/src/data/store/scalar/bigint.rs +++ b/graph/src/data/store/scalar/bigint.rs @@ -224,14 +224,20 @@ impl BigInt { } } -impl From for BigInt { - fn from(i: i32) -> BigInt { +impl From for BigInt { + fn from(i: i8) -> BigInt { BigInt::unchecked_new(i.into()) } } -impl From for BigInt { - fn from(i: u64) -> BigInt { +impl From for BigInt { + fn from(i: i16) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: i32) -> BigInt { BigInt::unchecked_new(i.into()) } } @@ -242,6 +248,36 @@ impl From for BigInt { } } +impl From for BigInt { + fn from(i: i128) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u8) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u16) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u32) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u64) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + impl From for BigInt { /// This implementation assumes that U64 represents an unsigned U64, /// and not a signed U64 (aka int64 in Solidity). 
Right now, this is diff --git a/graph/src/data/store/scalar/timestamp.rs b/graph/src/data/store/scalar/timestamp.rs index 02769d4adf8..58b2ef10cb8 100644 --- a/graph/src/data/store/scalar/timestamp.rs +++ b/graph/src/data/store/scalar/timestamp.rs @@ -90,6 +90,12 @@ impl stable_hash_legacy::StableHash for Timestamp { } } +impl From> for Timestamp { + fn from(value: DateTime) -> Self { + Self(value) + } +} + impl Display for Timestamp { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { write!(f, "{}", self.as_microseconds_since_epoch()) From c57a959a23e61dff79ea98bed206bfb1eeed0900 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:36:37 -0300 Subject: [PATCH 33/40] feat(core, graph): add Amp subgraph metrics --- Cargo.lock | 11 + Cargo.toml | 2 + core/Cargo.toml | 2 + core/src/amp_subgraph/metrics.rs | 236 +++++++++++++++++- core/src/amp_subgraph/runner/context.rs | 9 + .../amp_subgraph/runner/data_processing.rs | 33 ++- core/src/amp_subgraph/runner/data_stream.rs | 37 ++- core/src/amp_subgraph/runner/latest_blocks.rs | 2 + core/src/amp_subgraph/runner/mod.rs | 47 +++- core/src/amp_subgraph/runner/reorg_handler.rs | 2 + graph/Cargo.toml | 2 +- graph/src/components/metrics/registry.rs | 23 +- 12 files changed, 380 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6dedcee131a..f1af838abc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2879,8 +2879,10 @@ dependencies = [ "graph-chain-near", "graph-chain-substreams", "graph-runtime-wasm", + "indoc", "itertools", "parking_lot", + "prometheus", "serde_yaml", "slog", "strum", @@ -3832,6 +3834,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + [[package]] name = "inotify" version = "0.11.0" diff --git a/Cargo.toml b/Cargo.toml index bc7e59156a2..7cd023fcccb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ substreams = "=0.6.0" substreams-entity-change = "2" substreams-near-core = "=0.10.2" rand = { version = "0.9.2", features = ["os_rng"] } +prometheus = "0.14.0" # Dependencies related to Amp subgraphs ahash = "0.8.11" @@ -105,6 +106,7 @@ arrow = { version = "=55.0.0" } arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } futures = "0.3.31" half = "2.7.1" +indoc = "2.0.7" lazy-regex = "3.4.1" parking_lot = "0.12.4" sqlparser-latest = { version = "0.57.0", package = "sqlparser", features = ["visitor"] } diff --git a/core/Cargo.toml b/core/Cargo.toml index bbe5695712f..28afc1079c6 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -24,8 +24,10 @@ alloy.workspace = true arrow.workspace = true chrono.workspace = true futures.workspace = true +indoc.workspace = true itertools.workspace = true parking_lot.workspace = true +prometheus.workspace = true slog.workspace = true strum.workspace = true tokio-util.workspace = true diff --git a/core/src/amp_subgraph/metrics.rs b/core/src/amp_subgraph/metrics.rs index f89e85acfdd..1e74a4bcb9a 100644 --- a/core/src/amp_subgraph/metrics.rs +++ b/core/src/amp_subgraph/metrics.rs @@ -1,17 +1,26 @@ -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; +use alloy::primitives::BlockNumber; use graph::{ cheap_clone::CheapClone, components::{ metrics::{stopwatch::StopwatchMetrics, MetricsRegistry}, store::WritableStore, }, - data::subgraph::DeploymentHash, 
+ prelude::DeploymentHash, }; +use indoc::indoc; +use prometheus::{IntCounter, IntGauge}; use slog::Logger; -/// Contains deployment specific metrics. +/// Contains metrics specific to a deployment. pub(super) struct Metrics { + pub(super) deployment_status: DeploymentStatus, + pub(super) deployment_head: DeploymentHead, + pub(super) deployment_target: DeploymentTarget, + pub(super) deployment_synced: DeploymentSynced, + pub(super) indexing_duration: IndexingDuration, + pub(super) blocks_processed: BlocksProcessed, pub(super) stopwatch: StopwatchMetrics, } @@ -25,12 +34,227 @@ impl Metrics { ) -> Self { let stopwatch = StopwatchMetrics::new( logger.cheap_clone(), - deployment, + deployment.cheap_clone(), "amp-process", - metrics_registry, + metrics_registry.cheap_clone(), store.shard().to_string(), ); - Self { stopwatch } + let const_labels = [("deployment", &deployment)]; + + Self { + deployment_status: DeploymentStatus::new(&metrics_registry, const_labels.clone()), + deployment_head: DeploymentHead::new(&metrics_registry, const_labels.clone()), + deployment_target: DeploymentTarget::new(&metrics_registry, const_labels.clone()), + deployment_synced: DeploymentSynced::new(&metrics_registry, const_labels.clone()), + indexing_duration: IndexingDuration::new(&metrics_registry, const_labels.clone()), + blocks_processed: BlocksProcessed::new(&metrics_registry, const_labels.clone()), + stopwatch, + } + } +} + +/// Reports the current indexing status of a deployment. +pub(super) struct DeploymentStatus(IntGauge); + +impl DeploymentStatus { + const STATUS_STARTING: i64 = 1; + const STATUS_RUNNING: i64 = 2; + const STATUS_STOPPED: i64 = 3; + const STATUS_FAILED: i64 = 4; + + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_status", + indoc!( + " + Indicates the current indexing status of a deployment. + Possible values: + 1 - graph-node is preparing to start indexing; + 2 - deployment is being indexed; + 3 - indexing is stopped by request; + 4 - indexing failed; + " + ), + const_labels, + ) + .expect("failed to register `amp_deployment_status` gauge"); + + Self(int_gauge) + } + + /// Records that the graph-node is preparing to start indexing. + pub fn starting(&self) { + self.0.set(Self::STATUS_STARTING); + } + + /// Records that the deployment is being indexed. + pub fn running(&self) { + self.0.set(Self::STATUS_RUNNING); + } + + /// Records that the indexing stopped by request. + pub fn stopped(&self) { + self.0.set(Self::STATUS_STOPPED); + } + + /// Records that the indexing failed. + pub fn failed(&self) { + self.0.set(Self::STATUS_FAILED); + } +} + +/// Tracks the most recent block number processed by a deployment. +pub(super) struct DeploymentHead(IntGauge); + +impl DeploymentHead { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_head", + "Tracks the most recent block number processed by a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_head` gauge"); + + Self(int_gauge) + } + + /// Updates the most recent block number processed by this deployment. + pub(super) fn update(&self, new_most_recent_block_number: BlockNumber) { + self.0.set( + i64::try_from(new_most_recent_block_number) + .expect("new most recent block number does not fit into `i64`"), + ); + } +} + +/// Tracks the target block number of a deployment. 
+pub(super) struct DeploymentTarget(IntGauge); + +impl DeploymentTarget { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_target", + "Tracks the target block number of a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_target` gauge"); + + Self(int_gauge) + } + + /// Updates the target block number of this deployment. + pub(super) fn update(&self, new_target_block_number: BlockNumber) { + self.0.set( + i64::try_from(new_target_block_number) + .expect("new target block number does not fit into `i64`"), + ); + } +} + +/// Indicates whether a deployment has reached the chain head or the end block since it was deployed. +pub(super) struct DeploymentSynced(IntGauge); + +impl DeploymentSynced { + const NOT_SYNCED: i64 = 0; + const SYNCED: i64 = 1; + + pub fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_synced", + indoc!( + " + Indicates whether a deployment has reached the chain head or the end block since it was deployed. + Possible values: + 0 - deployment is not synced; + 1 - deployment is synced; + " + ), + const_labels, + ) + .expect("failed to register `amp_deployment_synced` gauge"); + + Self(int_gauge) + } + + /// Records the current sync status of this deployment. + pub fn record(&self, synced: bool) { + self.0.set(if synced { + Self::SYNCED + } else { + Self::NOT_SYNCED + }); + } +} + +/// Tracks the total duration in seconds of deployment indexing. +#[derive(Clone)] +pub(super) struct IndexingDuration(IntCounter); + +impl IndexingDuration { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_counter = metrics_registry + .new_int_counter( + "amp_deployment_indexing_duration_seconds", + "Tracks the total duration in seconds of deployment indexing", + const_labels, + ) + .expect("failed to register `amp_deployment_indexing_duration_seconds` counter"); + + Self(int_counter) + } + + /// Records a new indexing duration of this deployment. + pub(super) fn record(&self, duration: Duration) { + self.0.inc_by(duration.as_secs()) + } +} + +/// Tracks the total number of blocks processed by a deployment. +pub(super) struct BlocksProcessed(IntCounter); + +impl BlocksProcessed { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_counter = metrics_registry + .new_int_counter( + "amp_deployment_blocks_processed_count", + "Tracks the total number of blocks processed by a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_blocks_processed_count` counter"); + + Self(int_counter) + } + + /// Records a new processed block. + pub(super) fn record_one(&self) { + self.record(1); + } + + /// Records the new processed blocks. 
+ pub(super) fn record(&self, number_of_blocks_processed: usize) { + self.0.inc_by(number_of_blocks_processed as u64); } } diff --git a/core/src/amp_subgraph/runner/context.rs b/core/src/amp_subgraph/runner/context.rs index 2dbb44f8bb8..32e96148acf 100644 --- a/core/src/amp_subgraph/runner/context.rs +++ b/core/src/amp_subgraph/runner/context.rs @@ -93,4 +93,13 @@ impl Context { .min() .unwrap() } + + pub(super) fn max_end_block(&self) -> BlockNumber { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.source.end_block) + .max() + .unwrap() + } } diff --git a/core/src/amp_subgraph/runner/data_processing.rs b/core/src/amp_subgraph/runner/data_processing.rs index eb7f303e367..2801abc66aa 100644 --- a/core/src/amp_subgraph/runner/data_processing.rs +++ b/core/src/amp_subgraph/runner/data_processing.rs @@ -24,17 +24,24 @@ pub(super) async fn process_record_batch_groups( stream_table_ptr: Arc<[TablePtr]>, latest_block: BlockNumber, ) -> Result { + if record_batch_groups.is_empty() { + debug!(cx.logger, "Received no record batch groups"); + return Ok(entity_cache); + } + let from_block = record_batch_groups .first_key_value() - .map(|((block, _), _)| *block); + .map(|((block, _), _)| *block) + .unwrap(); let to_block = record_batch_groups .last_key_value() - .map(|((block, _), _)| *block); + .map(|((block, _), _)| *block) + .unwrap(); debug!(cx.logger, "Processing record batch groups"; - "from_block" => ?from_block, - "to_block" => ?to_block + "from_block" => from_block, + "to_block" => to_block ); for ((block_number, block_hash), record_batch_group) in record_batch_groups { @@ -59,14 +66,17 @@ pub(super) async fn process_record_batch_groups( )) })?; + cx.metrics.deployment_head.update(block_number); + cx.metrics.blocks_processed.record_one(); + trace!(cx.logger, "Completed processing record batch group"; "block" => block_number ); } debug!(cx.logger, "Completed processing record batch groups"; - "from_block" => ?from_block, - "to_block" => ?to_block + "from_block" => from_block, + "to_block" => to_block ); Ok(entity_cache) @@ -81,6 +91,11 @@ async fn process_record_batch_group( stream_table_ptr: &[TablePtr], latest_block: BlockNumber, ) -> Result { + let _section = cx + .metrics + .stopwatch + .start_section("process_record_batch_group"); + let RecordBatchGroup { record_batches } = record_batch_group; if record_batches.is_empty() { @@ -140,6 +155,10 @@ async fn process_record_batch_group( .map_err(Error::from) .map_err(|e| e.context("failed to transact block operations"))?; + if is_close_to_chain_head { + cx.metrics.deployment_synced.record(true); + } + Ok(EntityCache::with_current( cx.store.cheap_clone(), entity_lfu_cache, @@ -153,6 +172,8 @@ async fn process_record_batch( record_batch: RecordBatch, (i, j): TablePtr, ) -> Result<(), Error> { + let _section = cx.metrics.stopwatch.start_section("process_record_batch"); + let table = &cx.manifest.data_sources[i].transformer.tables[j]; let entity_name = &table.name; diff --git a/core/src/amp_subgraph/runner/data_stream.rs b/core/src/amp_subgraph/runner/data_stream.rs index c96aa0b1d05..7f3636a5af9 100644 --- a/core/src/amp_subgraph/runner/data_stream.rs +++ b/core/src/amp_subgraph/runner/data_stream.rs @@ -71,9 +71,10 @@ where for (j, table) in data_source.transformer.tables.iter().enumerate() { let query = table.query.build_with_block_range(block_range); + let stream = cx.client.query(&cx.logger, query, None); let stream_name = format!("{}.{}", data_source.name, table.name); - query_streams.push((stream_name, 
cx.client.query(&cx.logger, query, None))); + query_streams.push((stream_name, stream)); query_streams_table_ptr.push((i, j)); } } @@ -81,18 +82,40 @@ where let query_streams_table_ptr: Arc<[TablePtr]> = query_streams_table_ptr.into(); total_queries_to_execute += query_streams.len(); + let mut min_start_block_checked = false; + let mut load_first_record_batch_group_section = Some( + cx.metrics + .stopwatch + .start_section("load_first_record_batch_group"), + ); + data_streams.push( StreamAggregator::new(&cx.logger, query_streams, cx.max_buffer_size) .map_ok(move |response| (response, query_streams_table_ptr.cheap_clone())) .map_err(Error::from) - .and_then(move |response| async move { - if let Some(((first_block, _), _)) = response.0.first_key_value() { - if *first_block < min_start_block { - return Err(Error::NonDeterministic(anyhow!("chain reorg"))); - } + .map(move |result| { + if load_first_record_batch_group_section.is_some() { + let _section = load_first_record_batch_group_section.take(); } - Ok(response) + match result { + Ok(response) => { + if !min_start_block_checked { + if let Some(((first_block, _), _)) = response.0.first_key_value() { + if *first_block < min_start_block { + return Err(Error::NonDeterministic(anyhow!( + "chain reorg" + ))); + } + } + + min_start_block_checked = true; + } + + Ok(response) + } + Err(e) => Err(e), + } }) .boxed(), ); diff --git a/core/src/amp_subgraph/runner/latest_blocks.rs b/core/src/amp_subgraph/runner/latest_blocks.rs index 359d89aee07..559aef963cd 100644 --- a/core/src/amp_subgraph/runner/latest_blocks.rs +++ b/core/src/amp_subgraph/runner/latest_blocks.rs @@ -24,6 +24,7 @@ impl LatestBlocks { AC: Client, { debug!(cx.logger, "Loading latest blocks"); + let _section = cx.metrics.stopwatch.start_section("load_latest_blocks"); let latest_block_futs = cx .manifest @@ -82,6 +83,7 @@ impl LatestBlocks { AC: Client, { debug!(cx.logger, "Waiting for new blocks"); + let _section = cx.metrics.stopwatch.start_section("latest_blocks_changed"); let min_latest_block = self.min(); let latest_synced_block = cx.latest_synced_block(); diff --git a/core/src/amp_subgraph/runner/mod.rs b/core/src/amp_subgraph/runner/mod.rs index b7e65d62851..8fee0e9fda4 100644 --- a/core/src/amp_subgraph/runner/mod.rs +++ b/core/src/amp_subgraph/runner/mod.rs @@ -6,6 +6,8 @@ mod error; mod latest_blocks; mod reorg_handler; +use std::time::{Duration, Instant}; + use anyhow::Result; use futures::{future::BoxFuture, StreamExt}; use graph::{ @@ -30,16 +32,35 @@ where { Box::new(move |cancel_token| { Box::pin(async move { - match cancel_token + let indexing_duration_handle = tokio::spawn({ + let mut instant = Instant::now(); + let indexing_duration = cx.metrics.indexing_duration.clone(); + + async move { + loop { + tokio::time::sleep(Duration::from_secs(1)).await; + + let prev_instant = std::mem::replace(&mut instant, Instant::now()); + indexing_duration.record(prev_instant.elapsed()); + } + } + }); + + let result = cancel_token .run_until_cancelled(run_indexing_with_retries(&mut cx)) - .await - { + .await; + + indexing_duration_handle.abort(); + + match result { Some(result) => result?, None => { debug!(cx.logger, "Processed cancel signal"); } } + cx.metrics.deployment_status.stopped(); + debug!(cx.logger, "Waiting for the store to finish processing"); cx.store.flush().await?; Ok(()) @@ -51,7 +72,19 @@ async fn run_indexing(cx: &mut Context) -> Result<(), Error> where AC: Client, { + cx.metrics.deployment_status.starting(); + + if let Some(latest_synced_block) = 
cx.latest_synced_block() { + cx.metrics.deployment_head.update(latest_synced_block); + } + + cx.metrics + .deployment_synced + .record(cx.store.is_deployment_synced()); + loop { + cx.metrics.deployment_status.running(); + debug!(cx.logger, "Running indexing"; "latest_synced_block_ptr" => ?cx.latest_synced_block_ptr() ); @@ -60,6 +93,8 @@ where check_and_handle_reorg(cx, &latest_blocks).await?; if cx.indexing_completed() { + cx.metrics.deployment_synced.record(true); + debug!(cx.logger, "Indexing completed"); return Ok(()); } @@ -67,6 +102,10 @@ where latest_blocks = latest_blocks.filter_completed(cx); let latest_block = latest_blocks.min(); + cx.metrics + .deployment_target + .update(latest_block.min(cx.max_end_block())); + let mut deployment_is_failed = cx.store.health().await?.is_failed(); let mut entity_cache = EntityCache::new(cx.store.cheap_clone()); let mut stream = new_data_stream(cx, latest_block); @@ -108,6 +147,8 @@ where match run_indexing(cx).await { Ok(()) => return Ok(()), Err(e) => { + cx.metrics.deployment_status.failed(); + let deterministic = e.is_deterministic(); cx.store diff --git a/core/src/amp_subgraph/runner/reorg_handler.rs b/core/src/amp_subgraph/runner/reorg_handler.rs index e512d536fac..911c4ebf818 100644 --- a/core/src/amp_subgraph/runner/reorg_handler.rs +++ b/core/src/amp_subgraph/runner/reorg_handler.rs @@ -46,6 +46,8 @@ where return Ok(()); }; + let _section = cx.metrics.stopwatch.start_section("handle_reorg"); + debug!(logger, "Handling reorg"; "latest_synced_block" => latest_synced_block_number, "latest_block_before_reorg" => ?latest_block_before_reorg.block_number diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 9c55beb4c28..914986b3b8b 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -80,7 +80,7 @@ tokio-stream = { version = "0.1.15", features = ["sync"] } tokio-retry = "0.3.0" toml = "0.9.7" url = "2.5.7" -prometheus = "0.14.0" +prometheus.workspace = true priority-queue = "2.6.0" tonic = { workspace = true } prost = { workspace = true } diff --git a/graph/src/components/metrics/registry.rs b/graph/src/components/metrics/registry.rs index 93cf51b3bd1..b41f27bc785 100644 --- a/graph/src/components/metrics/registry.rs +++ b/graph/src/components/metrics/registry.rs @@ -1,8 +1,8 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use prometheus::IntGauge; use prometheus::{labels, Histogram, IntCounterVec}; +use prometheus::{IntCounter, IntGauge}; use slog::debug; use crate::components::metrics::{counter_with_labels, gauge_with_labels}; @@ -349,6 +349,23 @@ impl MetricsRegistry { Ok(counter) } + pub fn new_int_counter( + &self, + name: impl AsRef, + help: impl AsRef, + const_labels: impl IntoIterator, + ) -> Result { + let opts = Opts::new(name.as_ref(), help.as_ref()).const_labels( + const_labels + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ); + let int_counter = IntCounter::with_opts(opts)?; + self.register(name.as_ref(), Box::new(int_counter.clone())); + Ok(int_counter) + } + pub fn new_counter_with_labels( &self, name: &str, @@ -500,12 +517,12 @@ impl MetricsRegistry { &self, name: impl AsRef, help: impl AsRef, - const_labels: impl IntoIterator, impl Into)>, + const_labels: impl IntoIterator, ) -> Result { let opts = Opts::new(name.as_ref(), help.as_ref()).const_labels( const_labels .into_iter() - .map(|(a, b)| (a.into(), b.into())) + .map(|(key, value)| (key.to_string(), value.to_string())) .collect(), ); let gauge = IntGauge::with_opts(opts)?; From 
eb2bf43b39cab7d1a9173512d197e0cca04b5705 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:10:25 -0300 Subject: [PATCH 34/40] fix(graph): allow more complex dataset and table names --- graph/src/amp/manifest/data_source/raw.rs | 11 +- .../sql/query_builder/block_range_query.rs | 53 +++---- graph/src/amp/sql/query_builder/mod.rs | 2 +- .../amp/sql/query_builder/table_extractor.rs | 134 ++++++++++++------ .../amp/sql/query_builder/table_validator.rs | 41 +++--- 5 files changed, 154 insertions(+), 87 deletions(-) diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 38eda81b137..3129555f491 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -145,7 +145,9 @@ impl RawSource { end_block, } = self; - validate_ident(&dataset).map_err(|e| e.source_context("invalid `dataset`"))?; + if dataset.is_empty() { + return Err(Error::InvalidValue(anyhow!("`dataset` cannot be empty"))); + } Self::validate_tables(&tables)?; let address = address.unwrap_or(Address::ZERO); @@ -181,8 +183,11 @@ impl RawSource { } for (i, table) in tables.iter().enumerate() { - validate_ident(table) - .map_err(|e| e.source_context(format!("invalid `table` at index {i}")))?; + if table.is_empty() { + return Err(Error::InvalidValue(anyhow!( + "`table` at index {i} cannot be empty" + ))); + } } Ok(()) diff --git a/graph/src/amp/sql/query_builder/block_range_query.rs b/graph/src/amp/sql/query_builder/block_range_query.rs index e82966a5346..e9b91ca5136 100644 --- a/graph/src/amp/sql/query_builder/block_range_query.rs +++ b/graph/src/amp/sql/query_builder/block_range_query.rs @@ -8,7 +8,7 @@ use ahash::RandomState; use alloy::primitives::BlockNumber; use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; -use super::{extract_tables, normalize_table, parse_query}; +use super::{extract_tables, parse_query, TableReference}; /// Limits the query execution to the specified block range. /// @@ -56,7 +56,7 @@ pub(super) fn new_block_range_query<'a>( fn new_tables_to_ctes_mapping( query: &ast::Query, hasher: &mut impl Hasher, -) -> BTreeMap { +) -> BTreeMap { extract_tables(query) .into_iter() .map(|table| { @@ -69,12 +69,12 @@ fn new_tables_to_ctes_mapping( /// Visits the SQL query AST and replaces referenced table names with CTE names. struct TableReplacer { - tables_to_ctes_mapping: BTreeMap, + tables_to_ctes_mapping: BTreeMap, } impl TableReplacer { /// Creates a new table replacer. 
- fn new(tables_to_ctes_mapping: BTreeMap) -> Self { + fn new(tables_to_ctes_mapping: BTreeMap) -> Self { Self { tables_to_ctes_mapping, } @@ -86,7 +86,10 @@ impl TableReplacer { return; }; - let Some(cte_table) = self.tables_to_ctes_mapping.get(&normalize_table(name)) else { + let Some(cte_table) = self + .tables_to_ctes_mapping + .get(&TableReference::with_object_name(name)) + else { return; }; @@ -133,20 +136,20 @@ mod tests { assert_eq!( block_range_query, parse_query( - " - WITH block_range_14621009630487609643 AS ( - SELECT * FROM d WHERE _block_num BETWEEN 0 AND 1000000 + r#" + WITH block_range_1164572571450379730 AS ( + SELECT * FROM "d" WHERE _block_num BETWEEN 0 AND 1000000 ), - source_14621009630487609643 AS ( - SELECT a, b, c FROM block_range_14621009630487609643 AS d + source_1164572571450379730 AS ( + SELECT a, b, c FROM block_range_1164572571450379730 AS d ) SELECT - source_14621009630487609643.* + source_1164572571450379730.* FROM - source_14621009630487609643 + source_1164572571450379730 ORDER BY - source_14621009630487609643.b - " + source_1164572571450379730.b + "# ) .unwrap(), ) @@ -162,23 +165,23 @@ mod tests { assert_eq!( block_range_query, parse_query( - " - WITH block_range_14621009630487609643 AS ( - SELECT * FROM d WHERE _block_num BETWEEN 0 AND 1000000 + r#" + WITH block_range_1164572571450379730 AS ( + SELECT * FROM "d" WHERE _block_num BETWEEN 0 AND 1000000 ), - block_range_12377422807768256314 AS ( - SELECT * FROM e WHERE _block_num BETWEEN 0 AND 1000000 + block_range_13063992259633584610 AS ( + SELECT * FROM "e" WHERE _block_num BETWEEN 0 AND 1000000 ), - source_12377422807768256314 AS ( - SELECT a, b, c FROM block_range_14621009630487609643 AS d JOIN block_range_12377422807768256314 AS e ON e.e = d.d + source_13063992259633584610 AS ( + SELECT a, b, c FROM block_range_1164572571450379730 AS d JOIN block_range_13063992259633584610 AS e ON e.e = d.d ) SELECT - source_12377422807768256314.* + source_13063992259633584610.* FROM - source_12377422807768256314 + source_13063992259633584610 ORDER BY - source_12377422807768256314.b - " + source_13063992259633584610.b + "# ) .unwrap(), ) diff --git a/graph/src/amp/sql/query_builder/mod.rs b/graph/src/amp/sql/query_builder/mod.rs index 8a16b1a831f..5f5458ec092 100644 --- a/graph/src/amp/sql/query_builder/mod.rs +++ b/graph/src/amp/sql/query_builder/mod.rs @@ -22,7 +22,7 @@ use self::{ event_signature_resolver::resolve_event_signatures, parser::parse_query, source_address_resolver::resolve_source_address, - table_extractor::{extract_tables, normalize_table}, + table_extractor::{extract_tables, TableReference}, table_validator::validate_tables, }; diff --git a/graph/src/amp/sql/query_builder/table_extractor.rs b/graph/src/amp/sql/query_builder/table_extractor.rs index 0161e55fd49..b3cbc9d9d03 100644 --- a/graph/src/amp/sql/query_builder/table_extractor.rs +++ b/graph/src/amp/sql/query_builder/table_extractor.rs @@ -1,34 +1,84 @@ -use std::{collections::BTreeSet, ops::ControlFlow}; +use std::{collections::BTreeSet, fmt, ops::ControlFlow}; -use itertools::Itertools; use sqlparser_latest::ast::{self, Visit, Visitor}; /// Returns all tables that are referenced by the SQL query. /// /// The table names are lowercased and quotes are ignored. 
-pub(super) fn extract_tables(query: &ast::Query) -> BTreeSet { +pub(super) fn extract_tables(query: &ast::Query) -> BTreeSet { let mut table_extractor = TableExtractor::new(); let _: ControlFlow<()> = Visit::visit(query, &mut table_extractor); table_extractor.tables } -/// Returns the normalized table name. +/// Contains a normalized table reference. /// -/// The table name is lowercased and quotes are ignored. -pub(super) fn normalize_table(object_name: &ast::ObjectName) -> String { - object_name - .0 - .iter() - .map(|part| match part { - ast::ObjectNamePart::Identifier(ident) => ident.value.to_lowercase(), - }) - .join(".") +/// Used to compare physical table references with CTE names and custom tables. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(super) struct TableReference(ast::ObjectName); + +impl TableReference { + const QUOTE_STYLE: char = '"'; + + /// Creates a new table reference from a custom dataset and table. + pub(super) fn new(dataset: &str, table: &str) -> Self { + Self( + vec![ + ast::Ident::with_quote(Self::QUOTE_STYLE, dataset), + ast::Ident::with_quote(Self::QUOTE_STYLE, table), + ] + .into(), + ) + } + + /// Creates a new table reference from an object name. + pub(super) fn with_object_name(object_name: &ast::ObjectName) -> Self { + Self::with_idents( + object_name + .0 + .iter() + .map(|object_name_part| match object_name_part { + ast::ObjectNamePart::Identifier(ident) => ident, + }), + ) + } + + /// Creates a new table reference from a list of identifiers. + pub(super) fn with_idents<'a>(idents: impl IntoIterator) -> Self { + Self( + idents + .into_iter() + .map(|ident| { + let ast::Ident { + value, + quote_style, + span: _, + } = ident; + + ast::Ident::with_quote(Self::QUOTE_STYLE, { + if quote_style.is_none() { + value.to_lowercase() + } else { + value.to_owned() + } + }) + }) + .collect::>() + .into(), + ) + } +} + +impl fmt::Display for TableReference { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } } /// Visits the SQL query AST and extracts referenced table names, ignoring CTEs. struct TableExtractor { - tables: BTreeSet, + tables: BTreeSet, cte_stack: CteStack, } @@ -47,13 +97,12 @@ impl TableExtractor { return; }; - let table = normalize_table(name); - - if self.cte_stack.contains(&table) { + let table_reference = TableReference::with_object_name(name); + if self.cte_stack.contains(&table_reference) { return; } - self.tables.insert(table); + self.tables.insert(table_reference); } } @@ -81,7 +130,7 @@ impl Visitor for TableExtractor { /// Maintains a list of active CTEs for each subquery scope. struct CteStack { - stack: Vec>, + stack: Vec>, } impl CteStack { @@ -90,9 +139,11 @@ impl CteStack { Self { stack: Vec::new() } } - /// Returns `true` if the `table_name` is present in the CTE list at any scope. - fn contains(&self, table_name: &str) -> bool { - self.stack.iter().any(|scope| scope.contains(table_name)) + /// Returns `true` if the `table_reference` is present in the CTE list at any scope. + fn contains(&self, table_reference: &TableReference) -> bool { + self.stack + .iter() + .any(|scope| scope.contains(table_reference)) } /// Creates a new subquery scope with all the CTEs of the current `query`. 
@@ -101,7 +152,7 @@ impl CteStack { Some(with) => with .cte_tables .iter() - .map(|cte_table| cte_table.alias.name.value.to_lowercase()) + .map(|cte_table| TableReference::with_idents([&cte_table.alias.name])) .collect(), None => BTreeSet::new(), }; @@ -126,28 +177,31 @@ mod tests { #[test] fn $name() { let query = parse_query($input).unwrap(); - assert_eq!(extract_tables(&query), $expected.into_iter().map(Into::into).collect()); + assert_eq!( + extract_tables(&query).into_iter().map(|table| table.to_string()).collect::>(), + $expected.into_iter().map(|table| table.to_string()).collect::>() + ); } )* }; } test_extract_tables! { - one_table: "SELECT a FROM b" => ["b"], - multiple_tables_with_one_join: "SELECT a FROM b JOIN c ON c.c = b.b" => ["b", "c"], - multiple_tables_with_multiple_joins: "SELECT a FROM b JOIN c ON c.c = b.b JOIN d ON d.d = b.b" => ["b", "c", "d"], - one_table_with_one_cte: "WITH a AS (SELECT * FROM b) SELECT * FROM a" => ["b"], - one_table_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM a) SELECT * FROM c" => ["b"], - multiple_tables_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM d) SELECT * FROM a JOIN c ON c.c = a.a" => ["b", "d"], - multiple_tables_with_nested_ctes: "WITH a AS (WITH b AS (SELECT * FROM c) SELECT * FROM d JOIN b ON b.b = d.d) SELECT * FROM a" => ["c", "d"], - multiple_tables_with_union: "SELECT a FROM b UNION SELECT c FROM d" => ["b", "d"], - multiple_tables_with_union_all: "SELECT a FROM b UNION ALL SELECT c FROM d" => ["b", "d"], - - namespace_is_preserved: "SELECT a FROM b.c" => ["b.c"], - catalog_is_preserved: "SELECT a FROM b.c.d" => ["b.c.d"], - tables_are_lowercased: "SELECT a FROM B.C" => ["b.c"], - single_quotes_in_tables_are_ignored: "SELECT a FROM 'B'.'C'" => ["b.c"], - double_quotes_in_tables_are_ignored: r#"SELECT a FROM "B"."C""# => ["b.c"], - backticks_in_tables_are_ignored: "SELECT a FROM `B`.`C`" => ["b.c"], + one_table: "SELECT a FROM b" => [r#""b""#], + multiple_tables_with_one_join: "SELECT a FROM b JOIN c ON c.c = b.b" => [r#""b""#, r#""c""#], + multiple_tables_with_multiple_joins: "SELECT a FROM b JOIN c ON c.c = b.b JOIN d ON d.d = b.b" => [r#""b""#, r#""c""#, r#""d""#], + one_table_with_one_cte: "WITH a AS (SELECT * FROM b) SELECT * FROM a" => [r#""b""#], + one_table_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM a) SELECT * FROM c" => [r#""b""#], + multiple_tables_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM d) SELECT * FROM a JOIN c ON c.c = a.a" => [r#""b""#, r#""d""#], + multiple_tables_with_nested_ctes: "WITH a AS (WITH b AS (SELECT * FROM c) SELECT * FROM d JOIN b ON b.b = d.d) SELECT * FROM a" => [r#""c""#, r#""d""#], + multiple_tables_with_union: "SELECT a FROM b UNION SELECT c FROM d" => [r#""b""#, r#""d""#], + multiple_tables_with_union_all: "SELECT a FROM b UNION ALL SELECT c FROM d" => [r#""b""#, r#""d""#], + + namespace_is_preserved: "SELECT a FROM b.c" => [r#""b"."c""#], + catalog_is_preserved: "SELECT a FROM b.c.d" => [r#""b"."c"."d""#], + unquoted_tables_are_lowercased: "SELECT a FROM B.C" => [r#""b"."c""#], + single_quotes_in_tables_are_converted_to_double_quotes: "SELECT a FROM 'B'.'C'" => [r#""B"."C""#], + double_quotes_in_tables_are_preserved: r#"SELECT a FROM "B"."C""# => [r#""B"."C""#], + backticks_in_tables_are_converted_to_double_quotes: "SELECT a FROM `B`.`C`" => [r#""B"."C""#], } } diff --git a/graph/src/amp/sql/query_builder/table_validator.rs b/graph/src/amp/sql/query_builder/table_validator.rs index 
d1cd256c9f2..c3aac82f2d3 100644 --- a/graph/src/amp/sql/query_builder/table_validator.rs +++ b/graph/src/amp/sql/query_builder/table_validator.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use anyhow::{bail, Result}; use sqlparser_latest::ast; -use super::extract_tables; +use super::{extract_tables, TableReference}; /// Validates that SQL query references only allowed dataset and tables. /// @@ -26,11 +26,10 @@ pub(super) fn validate_tables<'a>( bail!("query does not use any tables"); } - let allowed_dataset = allowed_dataset.to_lowercase(); let allowed_tables = allowed_tables .into_iter() - .map(|allowed_table| format!("{allowed_dataset}.{}", allowed_table.to_lowercase())) - .collect::>(); + .map(|allowed_table| TableReference::new(allowed_dataset, allowed_table)) + .collect::>(); for used_table in used_tables { if !allowed_tables.contains(&used_table) { @@ -69,26 +68,32 @@ mod tests { test_validate_tables! { no_table_references: "SELECT *", "a", ["b"] => Err("query does not use any tables"), - missing_dataset: "SELECT * FROM b", "a", ["b"] => Err("table 'b' not allowed"), - missing_table: "SELECT * FROM a", "a", ["b"] => Err("table 'a' not allowed"), - invalid_dataset: "SELECT * FROM c.b", "a", ["b"] => Err("table 'c.b' not allowed"), - invalid_nested_dataset: "WITH a AS (SELECT * FROM c.b) SELECT * FROM a", "a", ["b"] => Err("table 'c.b' not allowed"), - invalid_table: "SELECT * FROM a.c", "a", ["b"] => Err("table 'a.c' not allowed"), - invalid_nested_table: "WITH a AS (SELECT * FROM a.c) SELECT * FROM a", "a", ["b"] => Err("table 'a.c' not allowed"), - using_catalog: "SELECT * FROM c.a.b", "a", ["b"] => Err("table 'c.a.b' not allowed"), + missing_dataset: "SELECT * FROM b", "a", ["b"] => Err(r#"table '"b"' not allowed"#), + missing_table: "SELECT * FROM a", "a", ["b"] => Err(r#"table '"a"' not allowed"#), + invalid_dataset: "SELECT * FROM c.b", "a", ["b"] => Err(r#"table '"c"."b"' not allowed"#), + invalid_nested_dataset: "WITH a AS (SELECT * FROM c.b) SELECT * FROM a", "a", ["b"] => Err(r#"table '"c"."b"' not allowed"#), + invalid_table: "SELECT * FROM a.c", "a", ["b"] => Err(r#"table '"a"."c"' not allowed"#), + invalid_nested_table: "WITH a AS (SELECT * FROM a.c) SELECT * FROM a", "a", ["b"] => Err(r#"table '"a"."c"' not allowed"#), + using_catalog: "SELECT * FROM c.a.b", "a", ["b"] => Err(r#"table '"c"."a"."b"' not allowed"#), one_valid_table: "SELECT * FROM a.b", "a", ["b"] => Ok(()), one_valid_nested_table: "WITH a AS (SELECT * FROM a.b) SELECT * FROM a", "a", ["b"] => Ok(()), multiple_valid_tables: "SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b", "a", ["b", "c"] => Ok(()), multiple_valid_nested_tables: "WITH a AS (SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b) SELECT * FROM a", "a", ["b", "c"] => Ok(()), - single_quotes_are_ignored: "SELECT * FROM 'a'.'b'", "a", ["b"] => Ok(()), - double_quotes_are_ignored: r#"SELECT * FROM "a"."b""#, "a", ["b"] => Ok(()), - backticks_are_ignored: "SELECT * FROM `a`.`b`", "a", ["b"] => Ok(()), + unquoted_dataset_is_case_insensitive: "SELECT * FROM A.b", "a", ["b"] => Ok(()), + unquoted_tables_are_case_insensitive: "SELECT * FROM a.B", "a", ["b"] => Ok(()), - dataset_is_case_insensitive: "SELECT * FROM A.b", "a", ["b"] => Ok(()), - tables_are_case_insensitive: "SELECT * FROM a.B", "a", ["b"] => Ok(()), - allowed_dataset_is_case_insensitive: "SELECT * FROM a.b", "A", ["b"] => Ok(()), - allowrd_tables_are_case_insensitive: "SELECT * FROM a.b", "a", ["B"] => Ok(()), + single_quoted_dataset_is_case_sensitive: "SELECT * FROM 'A'.b", "a", ["b"] => 
Err(r#"table '"A"."b"' not allowed"#), + single_quoted_tables_are_case_sensitive: "SELECT * FROM a.'B'", "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + double_quoted_dataset_is_case_sensitive: r#"SELECT * FROM "A".b"#, "a", ["b"] => Err(r#"table '"A"."b"' not allowed"#), + double_quoted_tables_are_case_sensitive: r#"SELECT * FROM a."B""#, "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + backtick_quoted_dataset_is_case_sensitive: "SELECT * FROM `A`.b", "a", ["b"] => Err(r#"table '"A"."b"' not allowed"#), + backtick_quoted_tables_are_case_sensitive: "SELECT * FROM a.`B`", "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + allowed_dataset_is_case_sensitive: "SELECT * FROM a.b", "A", ["b"] => Err(r#"table '"a"."b"' not allowed"#), + allowed_tables_are_case_sensitive: "SELECT * FROM a.b", "a", ["B"] => Err(r#"table '"a"."b"' not allowed"#), } } From e4d71e898aedabfe33fbe5eeacf935a83ba5a4d1 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:12:59 -0300 Subject: [PATCH 35/40] fix(graph): remove CTE name requirements --- graph/src/amp/sql/query_builder/parser.rs | 44 ----------------------- 1 file changed, 44 deletions(-) diff --git a/graph/src/amp/sql/query_builder/parser.rs b/graph/src/amp/sql/query_builder/parser.rs index 31ff4d068c8..1f965b955b6 100644 --- a/graph/src/amp/sql/query_builder/parser.rs +++ b/graph/src/amp/sql/query_builder/parser.rs @@ -16,7 +16,6 @@ use sqlparser_latest::{ /// - The SQL query cannot be parsed /// - The SQL query contains multiple SQL statements /// - The SQL query is not a `SELECT` query -/// - The SQL query contains CTEs with quoted names /// /// The returned error is deterministic. pub(super) fn parse_query(s: impl AsRef) -> Result { @@ -35,10 +34,6 @@ pub(super) fn parse_query(s: impl AsRef) -> Result { return Err(e); } - if let ControlFlow::Break(e) = query.visit(&mut AllowOnlyUnquotedCtes) { - return Err(e); - } - Ok(query) } @@ -76,41 +71,6 @@ impl Visitor for AllowOnlySelectQueries { } } -/// Validates that CTE names in the SQL query AST do not use quotes. -/// -/// This is a temporary solution that allows proper identification of table references. -struct AllowOnlyUnquotedCtes; - -impl AllowOnlyUnquotedCtes { - /// Returns an error if the `query` contains CTEs with quoted names. 
- fn visit_query(&self, query: &ast::Query) -> Result<()> { - let Some(with) = &query.with else { - return Ok(()); - }; - - for cte_table in &with.cte_tables { - let cte_name = &cte_table.alias.name; - - if cte_name.quote_style.is_some() { - bail!("invalid CTE {cte_name}: CTE names with quotes are not allowed"); - } - } - - Ok(()) - } -} - -impl Visitor for AllowOnlyUnquotedCtes { - type Break = anyhow::Error; - - fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { - match self.visit_query(query) { - Ok(()) => ControlFlow::Continue(()), - Err(e) => ControlFlow::Break(e), - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -151,9 +111,5 @@ mod tests { valid_query: "SELECT a FROM b" => Ok("SELECT a FROM b"), valid_query_with_cte: "WITH a AS (SELECT b FROM c) SELECT * FROM a" => Ok("WITH a AS (SELECT b FROM c) SELECT * FROM a"), valid_query_with_join: "SELECT a FROM b INNER JOIN c ON c.c = b.b" => Ok("SELECT a FROM b INNER JOIN c ON c.c = b.b"), - - single_quoted_ctes_not_allowed: "WITH 'a' AS (SELECT * FROM b) SELECT * FROM a" => Err("invalid CTE 'a': CTE names with quotes are not allowed"), - double_quoted_ctes_not_allowed: r#"WITH "a" AS (SELECT * FROM b) SELECT * FROM a"# => Err(r#"invalid CTE "a": CTE names with quotes are not allowed"#), - backticked_ctes_not_allowed: "WITH `a` AS (SELECT * FROM b) SELECT * FROM a" => Err("invalid CTE `a`: CTE names with quotes are not allowed"), } } From 459028cb3007d601bc0ed3c018f367ca628c65a5 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:33:19 -0300 Subject: [PATCH 36/40] fix(graph, node): add option to authenticate Flight service requests --- graph/src/amp/client/flight_client.rs | 14 +++++++++++++- graph/src/env/amp.rs | 11 +++++++++++ graph/src/env/mod.rs | 2 ++ node/src/launcher.rs | 6 +++++- node/src/manager/commands/run.rs | 13 ++++++++----- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs index d8a4f154632..588f1a97762 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -33,6 +33,7 @@ use crate::{ /// using the Apache Arrow Flight protocol. pub struct FlightClient { channel: Channel, + auth_token: Option, } impl FlightClient { @@ -56,16 +57,27 @@ impl FlightClient { Ok(Self { channel: endpoint.connect().await.map_err(Error::Connection)?, + auth_token: None, }) } + /// Sets the authentication token for requests to the Amp server. + pub fn set_auth_token(&mut self, auth_token: impl Into) { + self.auth_token = Some(auth_token.into()); + } + fn raw_client(&self) -> FlightSqlServiceClient { let channel = self.channel.cheap_clone(); let client = FlightServiceClient::new(channel) .max_encoding_message_size(256 * 1024 * 1024) .max_decoding_message_size(256 * 1024 * 1024); - FlightSqlServiceClient::new_from_inner(client) + let mut client = FlightSqlServiceClient::new_from_inner(client); + if let Some(auth_token) = &self.auth_token { + client.set_token(auth_token.clone()); + } + + client } } diff --git a/graph/src/env/amp.rs b/graph/src/env/amp.rs index 909db4134ad..ef4fff7c1dc 100644 --- a/graph/src/env/amp.rs +++ b/graph/src/env/amp.rs @@ -24,6 +24,11 @@ pub struct AmpEnv { /// /// Defaults to `600` seconds. pub query_retry_max_delay: Duration, + + /// Token used to authenticate Amp Flight gRPC service requests. + /// + /// Defaults to `None`. 
+ pub flight_service_token: Option, } impl AmpEnv { @@ -60,6 +65,12 @@ impl AmpEnv { .amp_query_retry_max_delay_seconds .map(Duration::from_secs) .unwrap_or(Self::DEFAULT_QUERY_RETRY_MAX_DELAY), + flight_service_token: raw_env.amp_flight_service_token.as_ref().and_then(|value| { + if value.is_empty() { + return None; + } + Some(value.to_string()) + }), } } } diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs index 00624c5ab75..09657c041f5 100644 --- a/graph/src/env/mod.rs +++ b/graph/src/env/mod.rs @@ -602,6 +602,8 @@ struct Inner { amp_query_retry_min_delay_seconds: Option, #[envconfig(from = "GRAPH_AMP_QUERY_RETRY_MAX_DELAY_SECONDS")] amp_query_retry_max_delay_seconds: Option, + #[envconfig(from = "GRAPH_AMP_FLIGHT_SERVICE_TOKEN")] + amp_flight_service_token: Option, } #[derive(Clone, Debug)] diff --git a/node/src/launcher.rs b/node/src/launcher.rs index 8195f6fd7fb..3bbccf5cf0e 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -505,10 +505,14 @@ pub async fn run( .parse() .expect("Invalid Amp Flight service address"); - let amp_client = amp::FlightClient::new(addr) + let mut amp_client = amp::FlightClient::new(addr) .await .expect("Failed to connect to Amp Flight service"); + if let Some(auth_token) = &env_vars.amp.flight_service_token { + amp_client.set_auth_token(auth_token); + } + Some(Arc::new(amp_client)) } None => None, diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs index bd80dedea0e..473a12e5d17 100644 --- a/node/src/manager/commands/run.rs +++ b/node/src/manager/commands/run.rs @@ -150,12 +150,15 @@ pub async fn run( .parse() .expect("Invalid Amp Flight service address"); - let amp_client = Arc::new( - amp::FlightClient::new(addr) - .await - .expect("Failed to connect to Amp Flight service"), - ); + let mut amp_client = amp::FlightClient::new(addr) + .await + .expect("Failed to connect to Amp Flight service"); + + if let Some(auth_token) = &env_vars.amp.flight_service_token { + amp_client.set_auth_token(auth_token); + } + let amp_client = Arc::new(amp_client); let amp_instance_manager = graph_core::amp_subgraph::Manager::new( &logger_factory, metrics_registry.cheap_clone(), From b7c720bc466e4066138d9d4af46635e5541ef607 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:46:23 -0300 Subject: [PATCH 37/40] fix(graph): update temporary predefined list of source context tables --- graph/src/amp/manifest/data_source/raw.rs | 29 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 3129555f491..4f4d8cef71d 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -84,7 +84,7 @@ impl RawDataSource { .map_err(|e| e.source_context("invalid `source`"))?; let transformer = transformer - .resolve(&logger, link_resolver, amp_client, &source) + .resolve(&logger, link_resolver, amp_client, &network, &source) .await .map_err(|e| e.source_context("invalid `transformer`"))?; @@ -222,6 +222,7 @@ impl RawTransformer { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + network: &str, source: &Source, ) -> Result { let Self { @@ -232,8 +233,16 @@ impl RawTransformer { Self::validate_api_version(&api_version)?; let abis = Self::resolve_abis(logger, link_resolver, abis).await?; - let tables = - Self::resolve_tables(logger, link_resolver, amp_client, tables, source, &abis).await?; + let tables 
= Self::resolve_tables( + logger, + link_resolver, + amp_client, + network, + tables, + source, + &abis, + ) + .await?; Ok(Transformer { api_version, @@ -285,6 +294,7 @@ impl RawTransformer { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + network: &str, tables: Vec, source: &Source, abis: &[Abi], @@ -308,7 +318,7 @@ impl RawTransformer { ); table - .resolve(&logger, link_resolver, amp_client, source, abis) + .resolve(&logger, link_resolver, amp_client, network, source, abis) .await .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) }); @@ -400,6 +410,7 @@ impl RawTable { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + network: &str, source: &Source, abis: &[Abi], ) -> Result { @@ -426,6 +437,7 @@ impl RawTable { let block_range_query_builder = Self::resolve_block_range_query_builder( logger, amp_client, + network, source, query, schema.clone(), @@ -522,6 +534,7 @@ impl RawTable { async fn resolve_block_range_query_builder( logger: &Logger, amp_client: &impl amp::Client, + network: &str, source: &Source, query: ValidQuery, schema: Schema, @@ -549,7 +562,13 @@ impl RawTable { .iter() .map(|table| (source.dataset.as_str(), table.as_str())) // TODO: Replace hardcoded values with schema metadata sources when available - .chain([("eth_firehose", "blocks"), ("eth_rpc", "blocks")]); + .chain(match network { + "ethereum-mainnet" => vec![("edgeandnode/ethereum_mainnet", "blocks")], + "base-mainnet" => vec![("edgeandnode/base_mainnet", "blocks")], + "base-sepolia" => vec![("edgeandnode/base_sepolia", "blocks")], + "arbitrum-one" => vec![("edgeandnode/arbitrum_one", "blocks")], + _ => vec![], + }); for (dataset, table) in context_sources_iter { let context_logger = logger.new(slog::o!( From 3ebfc279a7255ae3b9c10d768d722ca7a3191ddb Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:00:55 -0300 Subject: [PATCH 38/40] docs: add docs for Amp-powered subgraphs --- docs/amp-powered-subgraphs.md | 407 ++++++++++++++++++++++++++++++++++ 1 file changed, 407 insertions(+) create mode 100644 docs/amp-powered-subgraphs.md diff --git a/docs/amp-powered-subgraphs.md b/docs/amp-powered-subgraphs.md new file mode 100644 index 00000000000..26255a14938 --- /dev/null +++ b/docs/amp-powered-subgraphs.md @@ -0,0 +1,407 @@ +# Amp-powered subgraphs + +> [!NOTE] +> This features is available starting from spec version `1.4.0` + +Amp-powered subgraphs are a new kind of subgraphs with SQL data sources that query and index data from the Amp servers. +They are significantly more efficient than the standard subgraphs, and the indexing time can be reduced from days and weeks, +to minutes and hours in most cases. + +## Prerequisites + +To enable Amp-powered subgraphs, the `GRAPH_AMP_FLIGHT_SERVICE_ADDRESS` ENV variable must be set to a valid Amp Flight gRPC service address. + +Additionally, if authentication is required for the Amp Flight gRPC service, the `GRAPH_AMP_FLIGHT_SERVICE_TOKEN` ENV variable must contain a valid authentication token. + +## Subgraph manifest + +Amp-powered subgraphs introduce a new structure for defining Amp subgraph data sources within the manifest. + +### Spec version + +The minimum spec version for Amp-powered subgraphs is `1.4.0`. + +

+Example YAML: + +```yaml +specVersion: 1.4.0 +# .. other fields ... +``` +
+ +### Data source structure + +### `kind` + +Every Amp data source must have the `kind` set to `amp`, and Amp-powered subgraphs must contain only Amp data sources. +This is used to assign the subgraph to the appropriate indexing process. + +
+Example YAML: + +```yaml +dataSources: + - kind: amp + # .. other fields ... +``` +
+
+### `name`
+
+Every Amp data source must have the `name` set to a non-empty string, containing only numbers, letters, hyphens, or underscores.
+This name is used for observability purposes and to identify progress and potential errors produced by the data source.
+
+
+Example YAML: + +```yaml +dataSources: + - name: Transfers + # .. other fields ... +``` +
+ +### `network` + +Every Amp data source must have the `network` field set to a valid network name. +This is used to validate that the SQL queries for this data source produce results for the expected network. + +> [!NOTE] +> Currently, the SQL queries are required to produce results for a single network in order to maintain compatibility with non-Amp subgraphs. + +
+Example YAML: + +```yaml +dataSources: + - network: ethereum-mainnet + # .. other fields ... +``` +
+ +### `source` + +Every Amp data source must have a valid `source` that describes the behavior of SQL queries from this data source. + +### `source.dataset` + +Contains the name of the dataset that can be queried by SQL queries in this data source. +This is used to validate that the SQL queries for this data source only query the expected dataset. + +
+Example YAML: + +```yaml +dataSources: + - source: + dataset: edgeandnode/ethereum_mainnet + # .. other fields ... +``` +
+ +### `source.tables` + +Contains the names of the tables that can be queried by SQL queries in this data source. +This is used to validate that the SQL queries for this data source only query the expected tables. + +
+Example YAML: + +```yaml +dataSources: + - source: + tables: + - blocks + - transactions + # .. other fields ... +``` +
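+For illustration, a data source configured with the dataset and tables shown above could run a query that joins the two declared tables.
+The join on the `_block_num` column below is only an assumed pattern (see "SQL query requirements" further down), not a requirement of the dataset schema:
+
+```sql
+-- Sketch: references only the declared `blocks` and `transactions` tables.
+-- Joining on `_block_num` is an assumption based on the block number column
+-- described in the "SQL query requirements" section.
+SELECT
+    b._block_num
+    /* .. other projections .. */
+FROM
+    "edgeandnode/ethereum_mainnet".blocks AS b
+    JOIN "edgeandnode/ethereum_mainnet".transactions AS t ON t._block_num = b._block_num;
+```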
+ +### `source.address` + +Contains the contract address with which SQL queries in the data source interact. + +Enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. +SQL queries resolve `sg_source_address()` calls to this contract address. + +
+Example YAML: + +```yaml +dataSources: + - source: + address: "0xc944E90C64B2c07662A292be6244BDf05Cda44a7" + # .. other fields ... +``` +
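+With this configuration, a query can call `sg_source_address()` instead of repeating the address literal.
+The sketch below assumes a `logs` table with an `address` column; both names are illustrative only and not part of the documented schema:
+
+```sql
+-- Sketch: `logs` and its `address` column are assumed example names.
+-- `sg_source_address()` resolves to the `source.address` value from the manifest.
+SELECT
+    _block_num
+    /* .. other projections .. */
+FROM
+    "edgeandnode/ethereum_mainnet".logs
+WHERE
+    address = sg_source_address();
+```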
+ +### `source.startBlock` + +Contains the minimum block number that SQL queries in the data source can query. +This is used as a starting point for the indexing process. + +_When not provided, defaults to block number `0`._ + +
+Example YAML: + +```yaml +dataSources: + - source: + startBlock: 11446769 + # .. other fields ... +``` +
+ +### `source.endBlock` + +Contains the maximum block number that SQL queries in the data source can query. +Reaching this block number will complete the indexing process. + +_When not provided, defaults to the maximum possible block number._ + +
+Example YAML: + +```yaml +dataSources: + - source: + endBlock: 23847939 + # .. other fields ... +``` +
+
+### `transformer`
+
+Every Amp data source must have a valid `transformer` that describes how the source tables are transformed into the entities indexed by the Amp-powered subgraph.
+
+### `transformer.apiVersion`
+
+Represents the version of this transformer. Each version may contain a different set of features.
+
+> [!NOTE]
+> Currently, only version `0.0.1` is available.
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      apiVersion: 0.0.1
+      # .. other fields ...
+```
+
+
+### `transformer.abis`
+
+Contains a list of ABIs that SQL queries can reference to extract event signatures.
+
+Enables the use of `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls in
+SQL queries, which are resolved to full event signatures based on this list.
+
+_When not provided, defaults to an empty list._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      abis:
+        - name: ERC721 # The name of the contract
+          file: 
+      # .. other fields ...
+```
+
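+With an `ERC721` ABI declared as above, a query can look up event signatures by name instead of hard-coding them.
+The sketch below assumes a `logs` table with `address` and `topic0` columns, and assumes the resolved signature can be compared against `topic0` directly; these are illustrative assumptions rather than documented dataset behavior:
+
+```sql
+-- Sketch: `logs`, `address`, and `topic0` are assumed example names.
+-- `sg_event_signature('ERC721', 'Transfer')` resolves to the full signature of the
+-- `Transfer` event from the `ERC721` ABI listed in `transformer.abis`.
+SELECT
+    _block_num
+    /* .. other projections .. */
+FROM
+    "edgeandnode/ethereum_mainnet".logs
+WHERE
+    address = sg_source_address()
+    AND topic0 = sg_event_signature('ERC721', 'Transfer');
+```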
+ +### `transformer.tables` + +Contains a list of transformed tables that extract data from source tables into subgraph entities. + +### Transformer table structure + +### `transformer.tables[i].name` + +Represents the name of the transformed table. Must reference a valid entity name from the subgraph schema. + +
+Example:
+
+**GraphQL schema:**
+
+```graphql
+type Block @entity(immutable: true) {
+  # .. entity fields ...
+}
+```
+
+**YAML manifest:**
+
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - name: Block
+          # .. other fields ...
+```
+
+
+### `transformer.tables[i].query`
+
+Contains an inline SQL query that executes on the Amp server.
+This is useful for simple SQL queries like `SELECT * FROM "edgeandnode/ethereum_mainnet".blocks;`.
+For more complex cases, a separate file containing the SQL query can be used in the `file` field.
+
+The data returned by this SQL query is transformed into subgraph entities.
+
+_When not provided, the `file` field is used instead._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - query: SELECT * FROM "edgeandnode/ethereum_mainnet".blocks;
+          # .. other fields ...
+```
+
+
+### `transformer.tables[i].file`
+
+Contains the IPFS link to the SQL query that executes on the Amp server.
+
+The data returned by this SQL query is transformed into subgraph entities.
+
+_Ignored when the `query` field is provided._
+_When not provided, the `query` field is used instead._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - file: 
+          # .. other fields ...
+```
+
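+As a sketch, the referenced file could contain a standalone query such as the one below.
+The `blocks` table and the projected columns follow the examples in the "SQL query requirements" section further down; any other columns are dataset-specific:
+
+```sql
+-- Sketch: projects the block metadata columns that graph-node looks for
+-- (`_block_num`, `hash`, `timestamp`), plus whatever the entity needs.
+SELECT
+    _block_num,
+    hash,
+    timestamp
+    /* .. other projections .. */
+FROM
+    "edgeandnode/ethereum_mainnet".blocks;
+```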
+ +### Amp-powered subgraph examples + +Complete examples on how to create, deploy and query Amp-powered subgraphs are available in a separate repository: +https://github.com/edgeandnode/amp-subgraph-examples + +## SQL query requirements + +### Block numbers + +Every SQL query in Amp-powered subgraphs must return the block number for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +Graph-node will look for block numbers in the following columns: +`_block_num`, `block_num`, `blockNum`, `block`, `block_number`, `blockNumber`. + +Example SQL query: `SELECT _block_num, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +### Block hashes + +Every SQL query in Amp-powered subgraphs is expected to return the block hash for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +When a SQL query does not have the block hash projection, graph-node will attempt to get it from the +source tables specified in the subgraph manifest. + +Graph-node will look for block hashes in the following columns: +`hash`, `block_hash`, `blockHash`. + +Example SQL query: `SELECT hash, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +> [!NOTE] +> If a table does not contain the block hash column, it can be retrieved by joining that table with another that contains the column on the `_block_num` column. + +### Block timestamps + +Every SQL query in Amp-powered subgraphs is expected to return the block timestamps for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +When a SQL query does not have the block timestamps projection, graph-node will attempt to get it from the +source tables specified in the subgraph manifest. + +Graph-node will look for block timestamps in the following columns: +`timestamp`, `block_timestamp`, `blockTimestamp`. + +Example SQL query: `SELECT timestamp, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +> [!NOTE] +> If a table does not contain the block timestamp column, it can be retrieved by joining that table with another that contains the column on the `_block_num` column. + +## Type conversions + +Amp core SQL data types are converted intuitively to compatible subgraph entity types. + +## Schema generation + +Amp-powered subgraphs support the generation of GraphQL schemas based on the schemas of SQL queries referenced in the subgraph manifest. +This is useful when indexing entities that do not rely on complex relationships, such as contract events. + +The generated subgraph entities are immutable. + +To enable schema generation, simply remove the `schema` field from the subgraph manifest. + +> [!NOTE] +> For more flexibility and control over the schema, a manually created GraphQL schema is preferred. + +## Aggregations + +Amp-powered subgraphs fully support the subgraph aggregations feature. +This allows having complex aggregations on top of data indexed from the Amp servers. + +For more information on using the powerful subgraph aggregations feature, +refer to the [documentation](https://github.com/graphprotocol/graph-node/blob/master/docs/aggregations.md). + +## Composition + +Amp-powered subgraphs fully support the subgraph composition feature. +This allows applying complex subgraph mappings on top of data indexed from the Amp servers. 
+ +For more information on using the powerful subgraph composition feature, +refer to the [documentation](https://github.com/graphprotocol/example-composable-subgraph). + +## ENV variables + +Amp-powered subgraphs feature introduces the following new ENV variables: + +- `GRAPH_AMP_FLIGHT_SERVICE_ADDRESS` – The address of the Amp Flight gRPC service. _Defaults to `None`, which disables support for Amp-powered subgraphs._ +- `GRAPH_AMP_FLIGHT_SERVICE_TOKEN` – Token used to authenticate Amp Flight gRPC service requests. _Defaults to `None`, which disables authentication._ +- `GRAPH_AMP_MAX_BUFFER_SIZE` – Maximum number of response batches to buffer in memory per stream for each SQL query. _Defaults to `1,000`._ +- `GRAPH_AMP_MAX_BLOCK_RANGE` – Maximum number of blocks to request per stream for each SQL query. _Defaults to `2,000,000`._ +- `GRAPH_AMP_QUERY_RETRY_MIN_DELAY_SECONDS` – Minimum time to wait before retrying a failed SQL query to the Amp server. _Defaults to `1` second._ +- `GRAPH_AMP_QUERY_RETRY_MAX_DELAY_SECONDS` – Maximum time to wait before retrying a failed SQL query to the Amp server. _Defaults to `600` seconds._ + +## Metrics + +Amp-powered subgraphs feature introduces the following new metrics: + +- `amp_deployment_status` – Indicates the current indexing status of a deployment. + + **Possible values:** + - `1` - graph-node is preparing to start indexing; + - `2` - deployment is being indexed; + - `3` - indexing is stopped by request; + - `4` - indexing failed; +- `amp_deployment_head` – Tracks the most recent block number processed by a deployment. +- `amp_deployment_target` – Tracks the target block number of a deployment. +- `amp_deployment_synced` – Indicates whether a deployment has reached the chain head or the end block since it was deployed. + + **Possible values:** + - `0` - deployment is not synced; + - `1` - deployment is synced; +- `amp_deployment_indexing_duration_seconds` – Tracks the total duration in seconds of deployment indexing. +- `amp_deployment_blocks_processed_count` – Tracks the total number of blocks processed by a deployment. + + +Additionally, the `deployment_sync_secs` is extended with a new `amp-process` stage and new sections specific to the Amp indexing process. From 353f6e2f77633c38ff901cccfe386eb3d8f18e97 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:43:12 +0200 Subject: [PATCH 39/40] chore(core): reuse existing metric names --- core/src/amp_subgraph/metrics.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/core/src/amp_subgraph/metrics.rs b/core/src/amp_subgraph/metrics.rs index 1e74a4bcb9a..150b4d2b265 100644 --- a/core/src/amp_subgraph/metrics.rs +++ b/core/src/amp_subgraph/metrics.rs @@ -40,7 +40,10 @@ impl Metrics { store.shard().to_string(), ); - let const_labels = [("deployment", &deployment)]; + let const_labels = [ + ("deployment", deployment.to_string()), + ("shard", store.shard().to_string()), + ]; Self { deployment_status: DeploymentStatus::new(&metrics_registry, const_labels.clone()), @@ -69,7 +72,7 @@ impl DeploymentStatus { ) -> Self { let int_gauge = metrics_registry .new_int_gauge( - "amp_deployment_status", + "deployment_status", indoc!( " Indicates the current indexing status of a deployment. 
@@ -82,7 +85,7 @@ impl DeploymentStatus { ), const_labels, ) - .expect("failed to register `amp_deployment_status` gauge"); + .expect("failed to register `deployment_status` gauge"); Self(int_gauge) } @@ -118,11 +121,11 @@ impl DeploymentHead { ) -> Self { let int_gauge = metrics_registry .new_int_gauge( - "amp_deployment_head", + "deployment_head", "Tracks the most recent block number processed by a deployment", const_labels, ) - .expect("failed to register `amp_deployment_head` gauge"); + .expect("failed to register `deployment_head` gauge"); Self(int_gauge) } @@ -177,7 +180,7 @@ impl DeploymentSynced { ) -> Self { let int_gauge = metrics_registry .new_int_gauge( - "amp_deployment_synced", + "deployment_synced", indoc!( " Indicates whether a deployment has reached the chain head or the end block since it was deployed. @@ -188,7 +191,7 @@ impl DeploymentSynced { ), const_labels, ) - .expect("failed to register `amp_deployment_synced` gauge"); + .expect("failed to register `deployment_synced` gauge"); Self(int_gauge) } @@ -239,11 +242,11 @@ impl BlocksProcessed { ) -> Self { let int_counter = metrics_registry .new_int_counter( - "amp_deployment_blocks_processed_count", + "deployment_blocks_processed_count", "Tracks the total number of blocks processed by a deployment", const_labels, ) - .expect("failed to register `amp_deployment_blocks_processed_count` counter"); + .expect("failed to register `deployment_blocks_processed_count` counter"); Self(int_counter) } From 2418aa15aceb79325115d4ccd55cba4f97245f46 Mon Sep 17 00:00:00 2001 From: Ion Suman <47307091+isum@users.noreply.github.com> Date: Thu, 4 Dec 2025 19:58:13 +0200 Subject: [PATCH 40/40] fix(core, graph): minor adjustments after rebase --- core/src/amp_subgraph/manager.rs | 2 +- core/src/amp_subgraph/monitor.rs | 16 +- core/src/amp_subgraph/runner/compat.rs | 2 + .../amp_subgraph/runner/data_processing.rs | 18 +- core/src/amp_subgraph/runner/mod.rs | 65 ++-- docs/amp-powered-subgraphs.md | 340 +++++++++++++----- graph/src/amp/client/flight_client.rs | 2 +- graph/src/amp/common/mod.rs | 1 + graph/src/amp/manifest/data_source/raw.rs | 40 ++- graph/src/data/subgraph/mod.rs | 33 +- graph/src/data_source/mod.rs | 3 +- 11 files changed, 362 insertions(+), 160 deletions(-) diff --git a/core/src/amp_subgraph/manager.rs b/core/src/amp_subgraph/manager.rs index 041a30df226..ae272830880 100644 --- a/core/src/amp_subgraph/manager.rs +++ b/core/src/amp_subgraph/manager.rs @@ -146,7 +146,7 @@ where metrics, ); - let runner_result = runner::new_runner(runner_context)(cancel_token).await; + let runner_result = runner::new_runner(runner_context, cancel_token).await; match manager.subgraph_store.stop_subgraph(&deployment).await { Ok(()) => { diff --git a/core/src/amp_subgraph/monitor.rs b/core/src/amp_subgraph/monitor.rs index cfa1de2942d..7f2a09a0a91 100644 --- a/core/src/amp_subgraph/monitor.rs +++ b/core/src/amp_subgraph/monitor.rs @@ -2,9 +2,9 @@ //! //! # Terminology used in this module //! -//! `active subgraph` - A subgraph that was started and is still tracked. +//! `active subgraph` - A subgraph that was started and is still kept in memory in the list of started subgraphs. //! `running subgraph` - A subgraph that has an instance that is making progress or stopping. -//! `subgraph instance` - A background process that executes the subgraph runner future. +//! `subgraph instance` - A background task that executes the subgraph runner future. 
use std::{ collections::{hash_map::Entry, HashMap}, @@ -65,7 +65,7 @@ pub(super) struct Monitor { /// The channel that is used to send subgraph commands. /// /// Every subgraph start and stop request results in a command that is sent to the - /// background process that manages the subgraph instances. + /// background task that manages the subgraph instances. command_tx: mpsc::UnboundedSender, /// When a subgraph starts it is assigned a sequential ID. @@ -87,10 +87,10 @@ pub(super) struct Monitor { impl Monitor { /// Creates a new subgraph monitor. /// - /// Spawns a background process that manages the subgraph start and stop requests. + /// Spawns a background task that manages the subgraph start and stop requests. /// /// A new cancel token is derived from the `cancel_token` and only the derived token is used by the - /// subgraph monitor and its background process. + /// subgraph monitor and its background task. pub(super) fn new(logger_factory: &LoggerFactory, cancel_token: &CancellationToken) -> Self { let logger = logger_factory.component_logger("AmpSubgraphMonitor", None); let logger_factory = Arc::new(logger_factory.with_parent(logger)); @@ -394,9 +394,9 @@ impl Monitor { } } - /// Spawns a background process that executes the subgraph runner future. + /// Spawns a background task that executes the subgraph runner future. /// - /// An additional background process is spawned to handle the graceful shutdown of the subgraph runner, + /// An additional background task is spawned to handle the graceful shutdown of the subgraph runner, /// and to ensure correct behaviour even if the subgraph runner panics. fn start_subgraph( logger: Logger, @@ -502,7 +502,7 @@ impl Drop for Monitor { } } -/// Represents a background process that executes the subgraph runner future. +/// Represents a background task that executes the subgraph runner future. struct SubgraphInstance { id: u32, handle: JoinHandle<()>, diff --git a/core/src/amp_subgraph/runner/compat.rs b/core/src/amp_subgraph/runner/compat.rs index c695238416c..f7152965cf3 100644 --- a/core/src/amp_subgraph/runner/compat.rs +++ b/core/src/amp_subgraph/runner/compat.rs @@ -1,3 +1,5 @@ +//! This is a temporary compatibility module until the graph-node is fully migrated to `alloy`. + use alloy::primitives::{BlockHash, BlockNumber}; use chrono::{DateTime, Utc}; diff --git a/core/src/amp_subgraph/runner/data_processing.rs b/core/src/amp_subgraph/runner/data_processing.rs index 2801abc66aa..721b677459c 100644 --- a/core/src/amp_subgraph/runner/data_processing.rs +++ b/core/src/amp_subgraph/runner/data_processing.rs @@ -103,8 +103,14 @@ async fn process_record_batch_group( return Ok(entity_cache); } - let block_timestamp = decode_block_timestamp(&record_batches) - .map_err(|e| e.context("failed to decode block timestamp"))?; + let block_timestamp = if cx.manifest.schema.has_aggregations() { + decode_block_timestamp(&record_batches) + .map_err(|e| e.context("failed to decode block timestamp"))? + } else { + // TODO: Block timestamp is only required for subgraph aggregations. + // Make it optional at the store level. + DateTime::::MIN_UTC + }; for record_batch in record_batches { let StreamRecordBatch { @@ -231,6 +237,14 @@ async fn process_record_batch( Ok(()) } +/// Decodes the block timestamp from the first matching column in `record_batches`. +/// +/// Iterates through the provided record batches and returns the timestamp from +/// the first batch that contains a valid block timestamp column. 
+/// +/// # Preconditions +/// +/// All entries in `record_batches` must belong to the same record batch group. fn decode_block_timestamp(record_batches: &[StreamRecordBatch]) -> Result, Error> { let mut last_error: Option = None; diff --git a/core/src/amp_subgraph/runner/mod.rs b/core/src/amp_subgraph/runner/mod.rs index 8fee0e9fda4..a320c2d21b0 100644 --- a/core/src/amp_subgraph/runner/mod.rs +++ b/core/src/amp_subgraph/runner/mod.rs @@ -9,7 +9,7 @@ mod reorg_handler; use std::time::{Duration, Instant}; use anyhow::Result; -use futures::{future::BoxFuture, StreamExt}; +use futures::StreamExt; use graph::{ amp::Client, cheap_clone::CheapClone, components::store::EntityCache, data::subgraph::schema::SubgraphError, @@ -24,48 +24,45 @@ use self::{ pub(super) use self::context::Context; -pub(super) fn new_runner( +pub(super) async fn new_runner( mut cx: Context, -) -> Box BoxFuture<'static, Result<()>> + Send + 'static> + cancel_token: CancellationToken, +) -> Result<()> where AC: Client + Send + Sync + 'static, { - Box::new(move |cancel_token| { - Box::pin(async move { - let indexing_duration_handle = tokio::spawn({ - let mut instant = Instant::now(); - let indexing_duration = cx.metrics.indexing_duration.clone(); - - async move { - loop { - tokio::time::sleep(Duration::from_secs(1)).await; - - let prev_instant = std::mem::replace(&mut instant, Instant::now()); - indexing_duration.record(prev_instant.elapsed()); - } - } - }); - - let result = cancel_token - .run_until_cancelled(run_indexing_with_retries(&mut cx)) - .await; + let indexing_duration_handle = tokio::spawn({ + let mut instant = Instant::now(); + let indexing_duration = cx.metrics.indexing_duration.clone(); - indexing_duration_handle.abort(); + async move { + loop { + tokio::time::sleep(Duration::from_secs(1)).await; - match result { - Some(result) => result?, - None => { - debug!(cx.logger, "Processed cancel signal"); - } + let prev_instant = std::mem::replace(&mut instant, Instant::now()); + indexing_duration.record(prev_instant.elapsed()); } + } + }); + + let result = cancel_token + .run_until_cancelled(run_indexing_with_retries(&mut cx)) + .await; + + indexing_duration_handle.abort(); + + match result { + Some(result) => result?, + None => { + debug!(cx.logger, "Processed cancel signal"); + } + } - cx.metrics.deployment_status.stopped(); + cx.metrics.deployment_status.stopped(); - debug!(cx.logger, "Waiting for the store to finish processing"); - cx.store.flush().await?; - Ok(()) - }) - }) + debug!(cx.logger, "Waiting for the store to finish processing"); + cx.store.flush().await?; + Ok(()) } async fn run_indexing(cx: &mut Context) -> Result<(), Error> diff --git a/docs/amp-powered-subgraphs.md b/docs/amp-powered-subgraphs.md index 26255a14938..91012e2d328 100644 --- a/docs/amp-powered-subgraphs.md +++ b/docs/amp-powered-subgraphs.md @@ -1,7 +1,7 @@ # Amp-powered subgraphs > [!NOTE] -> This features is available starting from spec version `1.4.0` +> This features is available starting from spec version `1.5.0` Amp-powered subgraphs are a new kind of subgraphs with SQL data sources that query and index data from the Amp servers. They are significantly more efficient than the standard subgraphs, and the indexing time can be reduced from days and weeks, @@ -19,14 +19,27 @@ Amp-powered subgraphs introduce a new structure for defining Amp subgraph data s ### Spec version -The minimum spec version for Amp-powered subgraphs is `1.4.0`. +The minimum spec version for Amp-powered subgraphs is `1.5.0`.
Example YAML: -```yaml -specVersion: 1.4.0 -# .. other fields ... +```diff ++ specVersion: 1.5.0 + dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet + source: + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
@@ -40,10 +53,22 @@ This is used to assign the subgraph to the appropriate indexing process.
Example YAML: -```yaml -dataSources: - - kind: amp - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: ++ - kind: amp + name: Transfers + network: ethereum-mainnet + source: + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
@@ -55,10 +80,22 @@ This name is used for observability purposes and to identify progress and potent
Example YAML: -```yaml -dataSources: - - name: Transfers - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp ++ name: Transfers + network: ethereum-mainnet + source: + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
@@ -73,10 +110,22 @@ This is used to validate that the SQL queries for this data source produce resul
Example YAML: -```yaml -dataSources: - - network: ethereum-mainnet - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers ++ network: ethereum-mainnet + source: + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
@@ -92,11 +141,22 @@ This is used to validate that the SQL queries for this data source only query th
Example YAML: -```yaml -dataSources: - - source: - dataset: edgeandnode/ethereum_mainnet - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet ++ source: ++ dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
@@ -108,17 +168,26 @@ This is used to validate that the SQL queries for this data source only query th
Example YAML: -```yaml -dataSources: - - source: - tables: - - blocks - - transactions - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet ++ source: + dataset: edgeandnode/ethereum_mainnet ++ tables: ++ - blocks ++ - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
-### `source.address` +### Optional `source.address` Contains the contract address with which SQL queries in the data source interact. @@ -128,15 +197,27 @@ SQL queries resolve `sg_source_address()` calls to this contract address.
Example YAML: -```yaml -dataSources: - - source: - address: "0xc944E90C64B2c07662A292be6244BDf05Cda44a7" - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet ++ source: ++ address: "0xc944E90C64B2c07662A292be6244BDf05Cda44a7" + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
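+
+For illustration, a sketch of how a query in this data source might use the configured address; the `logs` table and its columns are placeholders here and may not match the actual dataset schema:
+
+```sql
+-- Keep only rows emitted by the contract configured in `source.address`.
+SELECT _block_num, topics, data
+FROM "edgeandnode/ethereum_mainnet".logs
+WHERE address = sg_source_address();
+```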
-### `source.startBlock` +### Optional `source.startBlock` Contains the minimum block number that SQL queries in the data source can query. This is used as a starting point for the indexing process. @@ -146,15 +227,27 @@ _When not provided, defaults to block number `0`._
Example YAML: -```yaml -dataSources: - - source: - startBlock: 11446769 - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet ++ source: ++ startBlock: 11446769 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
-### `source.endBlock` +### Optional `source.endBlock` Contains the maximum block number that SQL queries in the data source can query. Reaching this block number will complete the indexing process. @@ -164,11 +257,23 @@ _When not provided, defaults to the maximum possible block number._
Example YAML: -```yaml -dataSources: - - source: - endBlock: 23847939 - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet ++ source: ++ endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions + transformer: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
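+
+Conceptually, `source.startBlock` and `source.endBlock` bound the block range that graph-node streams from the Amp server, and that range is further split into chunks (see `GRAPH_AMP_MAX_BLOCK_RANGE`). The effect is roughly what the sketch below expresses; the actual rewriting happens inside graph-node and the literal SQL is only illustrative:
+
+```sql
+-- Illustrative only: data is requested between startBlock and endBlock.
+SELECT *
+FROM "edgeandnode/ethereum_mainnet".blocks
+WHERE _block_num BETWEEN 11446769 AND 23847939;
+```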
@@ -186,15 +291,27 @@ Represents the version of this transformer. Each version may contain a different
Example YAML: -```yaml -dataSource: - - transformer: - apiVersion: 0.0.1 - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet + source: + endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions ++ transformer: ++ apiVersion: 0.0.1 + tables: + - name: Transfers + file: ```
-### `transformer.abis` +### Optional `transformer.abis` Contains a list of ABIs that SQL queries can reference to extract event signatures. @@ -206,13 +323,26 @@ _When not provided, defaults to an empty list._
Example YAML: -```yaml -dataSource: - - transformer: - abis: - - name: ERC721 # The name of the contract - file: - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Transfers + network: ethereum-mainnet + source: + endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions ++ transformer: ++ abis: ++ - name: ERC721 # The name of the contract ++ file: + apiVersion: 0.0.1 + tables: + - name: Transfer + file: ```
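+
+As a rough illustration of what an ABI reference is for: the event signatures it declares ultimately identify log topics. The sketch below filters logs by the topic hash of the ERC721 `Transfer(address,address,uint256)` event; the `logs` table, its columns, and the hardcoded hash are illustrative rather than the documented way of referencing an ABI from SQL:
+
+```sql
+SELECT _block_num, address, topics, data
+FROM "edgeandnode/ethereum_mainnet".logs
+-- topic0 = keccak256("Transfer(address,address,uint256)")
+WHERE topics[1] = '0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef';
+```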
@@ -238,12 +368,23 @@ type Block @entity(immutable: true) { ``` **YAML manifest:** -```yaml -dataSource: - - transformer: - tables: - - name: Block - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Blocks + network: ethereum-mainnet + source: + endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions ++ transformer: + apiVersion: 0.0.1 ++ tables: ++ - name: Block + file: ``` @@ -260,12 +401,23 @@ _When not provided, the `file` field is used instead._
Example YAML: -```yaml -dataSource: - - transformer: - tables: - - query: SELECT * FROM "edgeandnode/ethereum_mainnet".blocks; - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Blocks + network: ethereum-mainnet + source: + endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions ++ transformer: + apiVersion: 0.0.1 ++ tables: + - name: Block ++ query: SELECT * FROM "edgeandnode/ethereum_mainnet".blocks; ```
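+
+In practice, an inline query usually projects the block metadata columns described in the "SQL query requirements" section, next to whatever columns map to entity fields. A minimal sketch, assuming the `blocks` table exposes these columns:
+
+```sql
+SELECT _block_num, hash, timestamp
+FROM "edgeandnode/ethereum_mainnet".blocks;
+```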
@@ -281,12 +433,23 @@ _When not provided, the `query` field is used instead._
Example YAML: -```yaml -dataSource: - - transformer: - tables: - - file: - # .. other fields ... +```diff + specVersion: 1.5.0 ++ dataSources: + - kind: amp + name: Blocks + network: ethereum-mainnet + source: + endBlock: 23847939 + dataset: edgeandnode/ethereum_mainnet + tables: + - blocks + - transactions ++ transformer: + apiVersion: 0.0.1 ++ tables: + - name: Block ++ file: ```
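+
+Related to the "SQL query requirements" section: when the table a query reads from carries no block hash (or block timestamp) column of its own, the note there suggests joining on `_block_num` with a table that does. A minimal sketch, with illustrative column names and assuming `blocks` provides the hash and timestamp:
+
+```sql
+SELECT t._block_num, b.hash, b.timestamp, t.value
+FROM "edgeandnode/ethereum_mainnet".transactions AS t
+JOIN "edgeandnode/ethereum_mainnet".blocks AS b
+  ON t._block_num = b._block_num;
+```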
@@ -325,6 +488,9 @@ Example SQL query: `SELECT hash, /* .. other projections .. */ FROM "edgeandnode ### Block timestamps +> [!NOTE] +> Only required for Amp-powered subgraphs that use subgraph aggregations. + Every SQL query in Amp-powered subgraphs is expected to return the block timestamps for every row. This is required because subgraphs rely on this information for storing subgraph entities. @@ -384,24 +550,10 @@ Amp-powered subgraphs feature introduces the following new ENV variables: ## Metrics -Amp-powered subgraphs feature introduces the following new metrics: - -- `amp_deployment_status` – Indicates the current indexing status of a deployment. +In addition to reporting updates to the existing `deployment_status`, `deployment_head`, `deployment_synced` and `deployment_blocks_processed_count` +metrics, Amp-powered subgraphs feature introduces the following new metrics: - **Possible values:** - - `1` - graph-node is preparing to start indexing; - - `2` - deployment is being indexed; - - `3` - indexing is stopped by request; - - `4` - indexing failed; -- `amp_deployment_head` – Tracks the most recent block number processed by a deployment. - `amp_deployment_target` – Tracks the target block number of a deployment. -- `amp_deployment_synced` – Indicates whether a deployment has reached the chain head or the end block since it was deployed. - - **Possible values:** - - `0` - deployment is not synced; - - `1` - deployment is synced; - `amp_deployment_indexing_duration_seconds` – Tracks the total duration in seconds of deployment indexing. -- `amp_deployment_blocks_processed_count` – Tracks the total number of blocks processed by a deployment. - Additionally, the `deployment_sync_secs` is extended with a new `amp-process` stage and new sections specific to the Amp indexing process. diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs index 588f1a97762..d2046b96e29 100644 --- a/graph/src/amp/client/flight_client.rs +++ b/graph/src/amp/client/flight_client.rs @@ -138,7 +138,7 @@ impl Client for FlightClient { prev_block_ranges = resume_streaming_query .iter() .cloned() - .map(Into::into) + .map(BlockRange::from) .collect(); let metadata = serialize_resume_streaming_query(resume_streaming_query); diff --git a/graph/src/amp/common/mod.rs b/graph/src/amp/common/mod.rs index d98fbea3b1b..9c4f5e71813 100644 --- a/graph/src/amp/common/mod.rs +++ b/graph/src/amp/common/mod.rs @@ -1,3 +1,4 @@ +// TODO: Remove this once there is a better way to get this information from Amp servers. pub(super) mod column_aliases { pub(in crate::amp) static BLOCK_NUMBER: &[&str] = &[ "_block_num", // Meta column present in all tables diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs index 4f4d8cef71d..d848c944318 100644 --- a/graph/src/amp/manifest/data_source/raw.rs +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -25,6 +25,7 @@ use crate::{ }, components::link_resolver::{LinkResolver, LinkResolverContext}, data::subgraph::DeploymentHash, + schema::InputSchema, }; /// Supported API versions for data source transformers. 
@@ -64,6 +65,7 @@ impl RawDataSource { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + input_schema: Option<&InputSchema>, ) -> Result { let Self { name, @@ -84,7 +86,14 @@ impl RawDataSource { .map_err(|e| e.source_context("invalid `source`"))?; let transformer = transformer - .resolve(&logger, link_resolver, amp_client, &network, &source) + .resolve( + &logger, + link_resolver, + amp_client, + input_schema, + &network, + &source, + ) .await .map_err(|e| e.source_context("invalid `transformer`"))?; @@ -222,6 +231,7 @@ impl RawTransformer { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + input_schema: Option<&InputSchema>, network: &str, source: &Source, ) -> Result { @@ -237,6 +247,7 @@ impl RawTransformer { logger, link_resolver, amp_client, + input_schema, network, tables, source, @@ -294,6 +305,7 @@ impl RawTransformer { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + input_schema: Option<&InputSchema>, network: &str, tables: Vec, source: &Source, @@ -318,7 +330,15 @@ impl RawTransformer { ); table - .resolve(&logger, link_resolver, amp_client, network, source, abis) + .resolve( + &logger, + link_resolver, + amp_client, + input_schema, + network, + source, + abis, + ) .await .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) }); @@ -410,6 +430,7 @@ impl RawTable { logger: &Logger, link_resolver: &dyn LinkResolver, amp_client: &impl amp::Client, + input_schema: Option<&InputSchema>, network: &str, source: &Source, abis: &[Abi], @@ -437,6 +458,7 @@ impl RawTable { let block_range_query_builder = Self::resolve_block_range_query_builder( logger, amp_client, + input_schema, network, source, query, @@ -534,6 +556,7 @@ impl RawTable { async fn resolve_block_range_query_builder( logger: &Logger, amp_client: &impl amp::Client, + input_schema: Option<&InputSchema>, network: &str, source: &Source, query: ValidQuery, @@ -545,10 +568,13 @@ impl RawTable { let (block_number_column, _) = auto_block_number_decoder(&record_batch).map_err(|e| Error::InvalidQuery(e))?; - let has_block_hash_column = auto_block_hash_decoder(&record_batch).is_ok(); - let has_block_timestamp_column = auto_block_timestamp_decoder(&record_batch).is_ok(); + let need_block_hash_column = auto_block_hash_decoder(&record_batch).is_err(); + let need_block_timestamp_column = input_schema + .map(|input_schema| input_schema.has_aggregations()) + .unwrap_or(false) + && auto_block_timestamp_decoder(&record_batch).is_err(); - if has_block_hash_column && has_block_timestamp_column { + if !need_block_hash_column && !need_block_timestamp_column { return Ok(BlockRangeQueryBuilder::new(query, block_number_column)); } @@ -590,7 +616,7 @@ impl RawTable { let record_batch = RecordBatch::new_empty(schema.clone().into()); let mut columns = Vec::new(); - if !has_block_hash_column { + if need_block_hash_column { let Ok((block_hash_column, _)) = auto_block_hash_decoder(&record_batch) else { debug!( context_logger, @@ -602,7 +628,7 @@ impl RawTable { columns.push(block_hash_column); } - if !has_block_timestamp_column { + if need_block_timestamp_column { let Ok((block_timestamp_column, _)) = auto_block_timestamp_decoder(&record_batch) else { debug!( diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index 6d893be55cc..f8047c0a807 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -1143,6 +1143,24 @@ impl UnresolvedSubgraphManifest { ); } + let schema = match 
unresolved_schema { + Some(unresolved_schema) => Some( + unresolved_schema + .resolve( + deployment_hash, + &spec_version, + id.cheap_clone(), + resolver, + logger, + ) + .await?, + ), + None => { + // It is attempted to be auto-generated after data sources are resolved. + None + } + }; + let data_sources = try_join_all(data_sources.into_iter().enumerate().map(|(idx, ds)| { ds.resolve( deployment_hash, @@ -1151,6 +1169,7 @@ impl UnresolvedSubgraphManifest { logger, idx as u32, &spec_version, + schema.as_ref(), ) })) .await?; @@ -1163,18 +1182,8 @@ impl UnresolvedSubgraphManifest { }) .collect_vec(); - let schema = match unresolved_schema { - Some(unresolved_schema) => { - unresolved_schema - .resolve( - deployment_hash, - &spec_version, - id.cheap_clone(), - resolver, - logger, - ) - .await? - } + let schema = match schema { + Some(schema) => schema, None if amp_data_sources.len() == data_sources.len() => { let table_schemas = amp_data_sources .iter() diff --git a/graph/src/data_source/mod.rs b/graph/src/data_source/mod.rs index e1598e2f0df..d33c6e41560 100644 --- a/graph/src/data_source/mod.rs +++ b/graph/src/data_source/mod.rs @@ -363,6 +363,7 @@ impl UnresolvedDataSource { logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, + input_schema: Option<&InputSchema>, ) -> Result, anyhow::Error> { match self { Self::Onchain(unresolved) => unresolved @@ -394,7 +395,7 @@ impl UnresolvedDataSource { } Self::Amp(raw_data_source) => match amp_client { Some(amp_client) => raw_data_source - .resolve(logger, resolver.as_ref(), amp_client.as_ref()) + .resolve(logger, resolver.as_ref(), amp_client.as_ref(), input_schema) .await .map(DataSource::Amp) .map_err(Error::from),