From b9370ea35c1621cfa1db8f9046176593c6b44cc0 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Sat, 1 Nov 2025 12:13:10 +0100 Subject: [PATCH 1/5] docs: Describe scaling of OpenSearch clusters --- .../opensearch/pages/usage-guide/scaling.adoc | 255 ++++++++++++++++++ .../storage-resource-configuration.adoc | 2 +- docs/modules/opensearch/partials/nav.adoc | 1 + 3 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 docs/modules/opensearch/pages/usage-guide/scaling.adoc diff --git a/docs/modules/opensearch/pages/usage-guide/scaling.adoc b/docs/modules/opensearch/pages/usage-guide/scaling.adoc new file mode 100644 index 0000000..6723ada --- /dev/null +++ b/docs/modules/opensearch/pages/usage-guide/scaling.adoc @@ -0,0 +1,255 @@ += Scaling OpenSearch clusters +:description: OpenSearch clusters can be scaled after provisioning but manual steps are required. + +OpenSearch clusters can be scaled after provisioning. +CPU and memory settings can be easily adapted as described in xref:opensearch:usage-guide/storage-resource-configuration.adoc#_resource_requests[Resource Requests]. +But for changing the number of nodes or resizing the volumes, the following points must be noted. + +Horizontal scaling, i.e. changing the replica count of role-groups, can be easily done for non-data nodes by adapting the OpenSearchCluster specification. +The number of data nodes can also be increased. +But decreasing the number of data nodes requires manual steps because if a pod which manages data is just shut down then its data is not reachable anymore. +Manual steps are required to drain the data from the nodes before removing them. + +Vertical scaling, i.e. changing the volume size of nodes, is not supported by the operator. +If the size of a volume can be changed depends on its CSI driver. 
+OpenSearch supports multiple data paths in one data node, but adding volumes in additional data paths usually does not solve the problem of low disk space because the data is not rebalanced across multiple data paths. + +[NOTE] +==== +The OpenSearch operator is still in an early stage and as development progresses, smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster according to the load) will be eventually supported by the operator. +==== + +== Manually scaling + +As mentioned above, scaling is quite a demanding task but there exists an easy workaround which will be presented here. + +For instance, the following OpenSearchCluster with three custer-manager nodes and five small data nodes is already deployed: + +[source,yaml] +---- +spec: + nodes: + roleGroups: + cluster-manager: + config: + nodeRoles: + - cluster_manager + replicas: 3 + data-small: + config: + nodeRoles: + - data + - ingest + - remote_cluster_client + resources: + storage: + data: + capacity: 10Gi + replicas: 5 +---- + +Now, you decide that instead of five small data nodes, three large data nodes are better suited. +To achieve this, you can replace the role-group `data-small` with a desired one. + +First, add the new role-group `data-large` with three replicas and a capacity of 100Gi per node: + +[source,yaml] +---- +spec: + nodes: + roleGroups: + cluster-manager: + config: + nodeRoles: + - cluster_manager + replicas: 3 + data-small: + config: + nodeRoles: + - data + - ingest + - remote_cluster_client + resources: + storage: + data: + capacity: 10Gi + replicas: 5 + data-large: + config: + nodeRoles: + - data + - ingest + - remote_cluster_client + resources: + storage: + data: + capacity: 100Gi + replicas: 3 +---- + +The data must now be moved from `data-small` to `data-large`. +With the cluster setting `cluster.routing.allocation.exclude`, nodes can be excluded from shard allocation. 
+If you did not disable the rebalancing, then existing data will be moved from the specified nodes to the allowed ones, in the example case from `data-small` to `data-large`. + +[TIP] +==== +The OpenSearch operator adds a role-group attribute to every OpenSearch node, so that it is easier to reference all nodes belonging to a role-group. +==== + +The following REST call excludes the role-group `data-small` from the shard allocation: + +[source,http] +---- +PUT _cluster/settings +{ + "persistent": { + "cluster": { + "routing": { + "allocation.exclude": { + "role-group": "data-small" + } + } + } + } +} +---- + +You have to wait now until all the data has been moved from `data-small` to `data-large`. +The current shard allocation can be requested at the `_cat/shards` endpoint, e.g.: + +[source,http] +---- +GET _cat/shards?v +index shard prirep state docs store ip node +logs 0 r STARTED 14074 6.9mb 10.244.0.60 opensearch-nodes-data-large-2 +logs 0 p RELOCATING 14074 8.5mb 10.244.0.52 opensearch-nodes-data-small-4 + -> 10.244.0.59 NFjQBBmWSm-pijXcxrXnvQ opensearch-nodes-data-large-1 +... + +GET _cat/shards?v +index shard prirep state docs store ip node +logs 0 r STARTED 14074 6.9mb 10.244.0.60 opensearch-nodes-data-large-2 +logs 0 p STARTED 14074 6.9mb 10.244.0.59 opensearch-nodes-data-large-1 +... 
+---- + +The statistics, especially the document count, can be retrieved at the `_nodes/role-group:data-small/stats` endpoint, e.g.: + +[source,http] +---- +GET _nodes/role-group:data-small/stats/indices/docs +{ + "_nodes": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "cluster_name": "opensearch", + "nodes": { + "wjaeQJUXQX6eNWYUeiScgQ": { + "timestamp": 1761992580239, + "name": "opensearch-nodes-data-small-4", + "transport_address": "10.244.0.52:9300", + "host": "10.244.0.52", + "ip": "10.244.0.52:9300", + "roles": [ + "data", + "ingest", + "remote_cluster_client" + ], + "attributes": { + "role-group": "data-small", + "shard_indexing_pressure_enabled": "true" + }, + "indices": { + "docs": { + "count": 14686, + "deleted": 0 + } + } + }, + ... + } +} + +GET _nodes/role-group:data-small/stats/indices/docs +{ + "_nodes": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "cluster_name": "opensearch", + "nodes": { + "wjaeQJUXQX6eNWYUeiScgQ": { + "timestamp": 1761992817422, + "name": "opensearch-nodes-data-small-4", + "transport_address": "10.244.0.52:9300", + "host": "10.244.0.52", + "ip": "10.244.0.52:9300", + "roles": [ + "data", + "ingest", + "remote_cluster_client" + ], + "attributes": { + "role-group": "data-small", + "shard_indexing_pressure_enabled": "true" + }, + "indices": { + "docs": { + "count": 0, + "deleted": 0 + } + } + }, + ... 
+ } +} + +---- + +When all shards were transferred, the role-group `data-small` can just be removed from the OpenSearchCluster specification: + +[source,yaml] +---- +spec: + nodes: + roleGroups: + cluster-manager: + config: + nodeRoles: + - cluster_manager + replicas: 3 + data-large: + config: + nodeRoles: + - data + - ingest + - remote_cluster_client + resources: + storage: + data: + capacity: 100Gi + replicas: 3 +---- + +Finally, the shard exclusion should be removed again from the cluster settings: + +[source,http] +---- +PUT _cluster/settings +{ + "persistent": { + "cluster": { + "routing": { + "allocation.exclude": { + "role-group": null + } + } + } + } +} +---- + +If your OpenSearch clients only used the service of the cluster-manager nodes to connect to the cluster, the switch from one to another data role-group should have been transparent for them. diff --git a/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc b/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc index cc86873..890cc96 100644 --- a/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc +++ b/docs/modules/opensearch/pages/usage-guide/storage-resource-configuration.adoc @@ -13,7 +13,7 @@ nodes: config: resources: storage: - logDirs: + data: capacity: 50Gi ---- diff --git a/docs/modules/opensearch/partials/nav.adoc b/docs/modules/opensearch/partials/nav.adoc index 0b47320..1994a6e 100644 --- a/docs/modules/opensearch/partials/nav.adoc +++ b/docs/modules/opensearch/partials/nav.adoc @@ -9,6 +9,7 @@ ** xref:opensearch:usage-guide/monitoring.adoc[] ** xref:opensearch:usage-guide/logging.adoc[] ** xref:opensearch:usage-guide/opensearch-dashboards.adoc[] +** xref:opensearch:usage-guide/scaling.adoc[] ** xref:opensearch:usage-guide/operations/index.adoc[] *** xref:opensearch:usage-guide/operations/cluster-operations.adoc[] *** xref:opensearch:usage-guide/operations/pod-placement.adoc[] From 
e09c8e8d8cfc495919c4731fd09ad67c762a3ac9 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Sat, 1 Nov 2025 12:28:05 +0100 Subject: [PATCH 2/5] feat: Add the role-group as a node attribute --- CHANGELOG.md | 2 ++ .../src/controller/build/node_config.rs | 17 ++++++++++++++++- .../src/controller/build/role_group_builder.rs | 1 + tests/templates/kuttl/smoke/10-assert.yaml.j2 | 2 ++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ccb0ef..1c164ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ All notable changes to this project will be documented in this file. - Helm: Allow Pod `priorityClassName` to be configured ([#34]). - Support log configuration and log aggregation ([#40]). - Ensure that the permissions of the configuration files are correct ([#47]). +- Add the role-group as a node attribute ([#63]). ### Changed @@ -41,3 +42,4 @@ All notable changes to this project will be documented in this file. [#40]: https://github.com/stackabletech/opensearch-operator/pull/40 [#47]: https://github.com/stackabletech/opensearch-operator/pull/47 [#58]: https://github.com/stackabletech/opensearch-operator/pull/58 +[#63]: https://github.com/stackabletech/opensearch-operator/pull/63 diff --git a/rust/operator-binary/src/controller/build/node_config.rs b/rust/operator-binary/src/controller/build/node_config.rs index db795f7..9249042 100644 --- a/rust/operator-binary/src/controller/build/node_config.rs +++ b/rust/operator-binary/src/controller/build/node_config.rs @@ -10,7 +10,7 @@ use crate::{ controller::OpenSearchRoleGroupConfig, crd::v1alpha1, framework::{ - ServiceName, + RoleGroupName, ServiceName, builder::pod::container::{EnvVarName, EnvVarSet}, role_group_utils, }, @@ -41,6 +41,10 @@ pub const CONFIG_OPTION_INITIAL_CLUSTER_MANAGER_NODES: &str = /// Type: string pub const CONFIG_OPTION_NETWORK_HOST: &str = "network.host"; +/// The custom node attribute "role-group" +/// Type: string +pub const 
CONFIG_OPTION_NODE_ATTR_ROLE_GROUP: &str = "node.attr.role-group"; + /// A descriptive name for the node. /// Type: string pub const CONFIG_OPTION_NODE_NAME: &str = "node.name"; @@ -61,6 +65,7 @@ pub const CONFIG_OPTION_PLUGINS_SECURITY_SSL_HTTP_ENABLED: &str = /// Configuration of an OpenSearch node based on the cluster and role-group configuration pub struct NodeConfig { cluster: ValidatedCluster, + role_group_name: RoleGroupName, role_group_config: OpenSearchRoleGroupConfig, discovery_service_name: ServiceName, } @@ -70,11 +75,13 @@ pub struct NodeConfig { impl NodeConfig { pub fn new( cluster: ValidatedCluster, + role_group_name: RoleGroupName, role_group_config: OpenSearchRoleGroupConfig, discovery_service_name: ServiceName, ) -> Self { Self { cluster, + role_group_name, role_group_config, discovery_service_name, } @@ -111,6 +118,10 @@ impl NodeConfig { CONFIG_OPTION_PLUGINS_SECURITY_NODES_DN.to_owned(), json!(["CN=generated certificate for pod".to_owned()]), ); + config.insert( + CONFIG_OPTION_NODE_ATTR_ROLE_GROUP.to_owned(), + json!(self.role_group_name), + ); for (setting, value) in self .role_group_config @@ -311,6 +322,8 @@ mod tests { let image: ProductImage = serde_json::from_str(r#"{"productVersion": "3.1.0"}"#) .expect("should be a valid ProductImage"); + let role_group_name = RoleGroupName::from_str_unsafe("data"); + let role_group_config = OpenSearchRoleGroupConfig { replicas: test_config.replicas, config: ValidatedOpenSearchConfig { @@ -374,6 +387,7 @@ mod tests { NodeConfig::new( cluster, + role_group_name, role_group_config, ServiceName::from_str_unsafe("my-opensearch-cluster-manager"), ) @@ -391,6 +405,7 @@ mod tests { "cluster.name: \"my-opensearch-cluster\"\n", "discovery.type: \"zen\"\n", "network.host: \"0.0.0.0\"\n", + "node.attr.role-group: \"data\"\n", "plugins.security.nodes_dn: [\"CN=generated certificate for pod\"]\n", "test: \"value\"" ) diff --git a/rust/operator-binary/src/controller/build/role_group_builder.rs 
b/rust/operator-binary/src/controller/build/role_group_builder.rs index 431690f..896c344 100644 --- a/rust/operator-binary/src/controller/build/role_group_builder.rs +++ b/rust/operator-binary/src/controller/build/role_group_builder.rs @@ -101,6 +101,7 @@ impl<'a> RoleGroupBuilder<'a> { cluster: cluster.clone(), node_config: NodeConfig::new( cluster.clone(), + role_group_name.clone(), role_group_config.clone(), discovery_service_name, ), diff --git a/tests/templates/kuttl/smoke/10-assert.yaml.j2 b/tests/templates/kuttl/smoke/10-assert.yaml.j2 index 43b8aa5..5739749 100644 --- a/tests/templates/kuttl/smoke/10-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/10-assert.yaml.j2 @@ -652,6 +652,7 @@ data: cluster.routing.allocation.disk.threshold_enabled: "false" discovery.type: "zen" network.host: "0.0.0.0" + node.attr.role-group: "cluster-manager" node.store.allow_mmap: "false" plugins.security.allow_default_init_securityindex: "true" plugins.security.nodes_dn: ["CN=generated certificate for pod"] @@ -687,6 +688,7 @@ data: cluster.routing.allocation.disk.threshold_enabled: "false" discovery.type: "zen" network.host: "0.0.0.0" + node.attr.role-group: "data" node.store.allow_mmap: "false" plugins.security.allow_default_init_securityindex: "true" plugins.security.nodes_dn: ["CN=generated certificate for pod"] From 16a25857b4d72d22945c7fbde6d745c09eb7239a Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 13 Nov 2025 10:36:40 +0100 Subject: [PATCH 3/5] docs: Refine the text with GPT-4o mini --- .../opensearch/pages/usage-guide/scaling.adoc | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/docs/modules/opensearch/pages/usage-guide/scaling.adoc b/docs/modules/opensearch/pages/usage-guide/scaling.adoc index 6723ada..8025acf 100644 --- a/docs/modules/opensearch/pages/usage-guide/scaling.adoc +++ b/docs/modules/opensearch/pages/usage-guide/scaling.adoc @@ -2,28 +2,31 @@ :description: OpenSearch clusters can be scaled after 
provisioning but manual steps are required. OpenSearch clusters can be scaled after provisioning. -CPU and memory settings can be easily adapted as described in xref:opensearch:usage-guide/storage-resource-configuration.adoc#_resource_requests[Resource Requests]. -But for changing the number of nodes or resizing the volumes, the following points must be noted. +CPU and memory settings can be easily adjusted, as detailed in the xref:opensearch:usage-guide/storage-resource-configuration.adoc#_resource_requests[Resource Requests] section. +However, when changing the number of nodes or resizing volumes, the following considerations must be kept in mind. -Horizontal scaling, i.e. changing the replica count of role-groups, can be easily done for non-data nodes by adapting the OpenSearchCluster specification. -The number of data nodes can also be increased. -But decreasing the number of data nodes requires manual steps because if a pod which manages data is just shut down then its data is not reachable anymore. -Manual steps are required to drain the data from the nodes before removing them. +Horizontal scaling, which involves adjusting the replica count of role groups, can be easily accomplished for non-data nodes by modifying the OpenSearchCluster specification. +Additionally, the number of data nodes can be increased. +However, reducing the number of data nodes requires manual intervention. +If a pod that manages data is simply shut down, its data becomes inaccessible. +Therefore, it is necessary to manually drain the data from the nodes before removing them. -Vertical scaling, i.e. changing the volume size of nodes, is not supported by the operator. -If the size of a volume can be changed depends on its CSI driver. -OpenSearch supports multiple data paths in one data node, but adding volumes in additional data paths usually does not solve the problem of low disk space because the data is not rebalanced across multiple data paths. 
+Vertical scaling, which refers to changing the volume size of nodes, is not supported by the operator. +Whether the size of a volume can be changed depends on its CSI driver. +OpenSearch allows for multiple data paths within a single data node, but adding volumes to additional data paths typically does not resolve low disk space issues, as the data is not automatically rebalanced across multiple data paths. [NOTE] ==== -The OpenSearch operator is still in an early stage and as development progresses, smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster according to the load) will be eventually supported by the operator. +The OpenSearch operator is currently in the early stages of development. +As progress continues, smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster based on load) will eventually be supported. ==== == Manually scaling -As mentioned above, scaling is quite a demanding task but there exists an easy workaround which will be presented here. +As noted earlier, scaling can be quite challenging; +however, an easy workaround exists, which will be presented here. -For instance, the following OpenSearchCluster with three custer-manager nodes and five small data nodes is already deployed: +For example, the following OpenSearchCluster has been deployed with three cluster-manager nodes and five small data nodes: [source,yaml] ---- @@ -48,10 +51,10 @@ spec: replicas: 5 ---- -Now, you decide that instead of five small data nodes, three large data nodes are better suited. -To achieve this, you can replace the role-group `data-small` with a desired one. +You have decided that three large data nodes would be more suitable than five small ones. +To implement this change, you can replace the role group `data-small` with a new role group that has the desired configuration. 
-First, add the new role-group `data-large` with three replicas and a capacity of 100Gi per node: +First, add the new role group `data-large` with three replicas and a storage capacity of 100Gi per node: [source,yaml] ---- @@ -87,16 +90,16 @@ spec: replicas: 3 ---- -The data must now be moved from `data-small` to `data-large`. -With the cluster setting `cluster.routing.allocation.exclude`, nodes can be excluded from shard allocation. -If you did not disable the rebalancing, then existing data will be moved from the specified nodes to the allowed ones, in the example case from `data-small` to `data-large`. +The data must now be transferred from `data-small` to `data-large`. +By using the cluster setting `cluster.routing.allocation.exclude`, you can exclude nodes from shard allocation. +If rebalancing has not been disabled, existing data will automatically move from the specified nodes to the allowed ones—in this case, from `data-small` to `data-large`. [TIP] ==== -The OpenSearch operator adds a role-group attribute to every OpenSearch node, so that it is easier to reference all nodes belonging to a role-group. +The OpenSearch operator assigns the node attribute `role-group` to each OpenSearch node, making it easier to reference all nodes associated with a specific role group. ==== -The following REST call excludes the role-group `data-small` from the shard allocation: +The following REST call excludes the `data-small` role group from shard allocation: [source,http] ---- @@ -114,8 +117,8 @@ PUT _cluster/settings } ---- -You have to wait now until all the data has been moved from `data-small` to `data-large`. -The current shard allocation can be requested at the `_cat/shards` endpoint, e.g.: +You must wait until all data has been transferred from `data-small` to `data-large`. +You can request the current shard allocation at the `_cat/shards` endpoint, for example: [source,http] ---- @@ -133,7 +136,7 @@ logs 0 p STARTED 14074 6.9mb 10.244.0.59 opensearch-nodes-data-large ... 
---- -The statistics, especially the document count, can be retrieved at the `_nodes/role-group:data-small/stats` endpoint, e.g.: +Statistics, particularly the document count, can be retrieved from the `_nodes/role-group:data-small/stats` endpoint, for example: [source,http] ---- @@ -209,7 +212,7 @@ GET _nodes/role-group:data-small/stats/indices/docs ---- -When all shards were transferred, the role-group `data-small` can just be removed from the OpenSearchCluster specification: +Once all shards have been transferred, the `data-small` role group can be removed from the OpenSearchCluster specification: [source,yaml] ---- @@ -234,7 +237,7 @@ spec: replicas: 3 ---- -Finally, the shard exclusion should be removed again from the cluster settings: +Finally, the shard exclusion should be removed from the cluster settings: [source,http] ---- @@ -252,4 +255,4 @@ PUT _cluster/settings } ---- -If your OpenSearch clients only used the service of the cluster-manager nodes to connect to the cluster, the switch from one to another data role-group should have been transparent for them. +If your OpenSearch clients connected to the cluster exclusively through the cluster-manager nodes, the switch from one data role group to another should have been seamless for them. 
From a291a180d35919462bb968330279cebc28c4c7f4 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 13 Nov 2025 10:46:04 +0100 Subject: [PATCH 4/5] docs: Do not announce future support of smart and auto scaling --- docs/modules/opensearch/pages/usage-guide/scaling.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/opensearch/pages/usage-guide/scaling.adoc b/docs/modules/opensearch/pages/usage-guide/scaling.adoc index 8025acf..275b467 100644 --- a/docs/modules/opensearch/pages/usage-guide/scaling.adoc +++ b/docs/modules/opensearch/pages/usage-guide/scaling.adoc @@ -18,7 +18,7 @@ OpenSearch allows for multiple data paths within a single data node, but adding [NOTE] ==== The OpenSearch operator is currently in the early stages of development. -As progress continues, smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster based on load) will eventually be supported. +Smart scaling (adapting resources without data loss) and auto scaling (scaling the cluster based on load) are not supported. ==== == Manually scaling From 1610224159c7be1c77b9cc1482c5365de705d1f6 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 13 Nov 2025 11:02:26 +0100 Subject: [PATCH 5/5] chore: Fix the changelog --- CHANGELOG.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 519d944..52a6d74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- Add the role group as a node attribute ([#63]). + +[#63]: https://github.com/stackabletech/opensearch-operator/pull/63 + ## [25.11.0] - 2025-11-07 ## [25.11.0-rc1] - 2025-11-06 @@ -31,7 +37,6 @@ All notable changes to this project will be documented in this file. - Helm: Allow Pod `priorityClassName` to be configured ([#34]). - Support log configuration and log aggregation ([#40]). 
- Ensure that the permissions of the configuration files are correct ([#47]). -- Add the role-group as a node attribute ([#63]). ### Changed @@ -46,4 +51,3 @@ All notable changes to this project will be documented in this file. [#40]: https://github.com/stackabletech/opensearch-operator/pull/40 [#47]: https://github.com/stackabletech/opensearch-operator/pull/47 [#58]: https://github.com/stackabletech/opensearch-operator/pull/58 -[#63]: https://github.com/stackabletech/opensearch-operator/pull/63