-
Notifications
You must be signed in to change notification settings - Fork 72
Add groupby split_out config options to dask-sql #286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a79c6d9
7dd5bae
9da285c
f8b4eb5
bd15fb3
b755bb2
696d47c
45edb63
666eb84
e4c8613
cdb7d62
57b199c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -230,3 +230,88 @@ def __init__(self, name: str): | |
| self.models: Dict[str, Tuple[Any, List[str]]] = {} | ||
| self.functions: Dict[str, UDF] = {} | ||
| self.function_lists: List[FunctionDescription] = [] | ||
| self.config: ConfigContainer = ConfigContainer() | ||
|
|
||
|
|
||
class ConfigContainer:
    """
    Helper class that contains configuration options required for specific operations.

    Configurations are stored in a flat dictionary whose string keys are delimited
    by ``.`` for easier nested-style access of groups of related configurations
    via a common prefix.

    Example:
        Dask groupby aggregate operations can be configured with the ``split_out``
        option to determine the number of output partitions, or the ``split_every``
        option to determine the number of partitions used during the groupby tree
        reduction step.
    """

    def __init__(self):
        # Do not set defaults here unless needed.
        # This maintains the list of configuration options supported that can be set:
        # "dask.groupby.aggregate.split_out": 1,
        # "dask.groupby.aggregate.split_every": None,
        self.config_dict: Dict[str, Any] = {}

    def set_config(self, config_options: Union[Tuple[str, Any], Dict[str, Any]]):
        """
        Set one or more configuration values.

        Accepts either a tuple of ``(config, val)`` or a dictionary containing
        multiple ``{config1: val1, config2: val2}`` pairs and updates the schema
        config with these values.
        """
        if isinstance(config_options, tuple):
            # dict.update accepts an iterable of key/value pairs,
            # so wrap the single (key, value) tuple in a list
            config_options = [config_options]
        self.config_dict.update(config_options)

    def drop_config(self, config_strs: Union[str, List[str]]):
        """
        Remove one or more configuration keys.

        Accepts a single key string or a list of key strings.

        Raises:
            KeyError: if any of the given keys is not currently set.
        """
        if isinstance(config_strs, str):
            config_strs = [config_strs]
        for config_key in config_strs:
            self.config_dict.pop(config_key)

    def get_config_by_prefix(self, config_prefix: Optional[str]):
        """
        Return all configuration options matching the prefix in ``config_prefix``.

        A falsy prefix (``None`` or the empty string) returns an empty dict;
        a prefix with no matches also returns an empty dict.

        Example:
        .. code-block:: python

            from dask_sql.datacontainer import ConfigContainer

            sql_config = ConfigContainer()
            sql_config.set_config(
                {
                    "dask.groupby.aggregate.split_out": 1,
                    "dask.groupby.aggregate.split_every": 1,
                    "dask.sort.persist": True,
                }
            )

            sql_config.get_config_by_prefix("dask.groupby")
            # Returns {
            #     "dask.groupby.aggregate.split_out": 1,
            #     "dask.groupby.aggregate.split_every": 1
            # }

            sql_config.get_config_by_prefix("dask")
            # Returns {
            #     "dask.groupby.aggregate.split_out": 1,
            #     "dask.groupby.aggregate.split_every": 1,
            #     "dask.sort.persist": True
            # }

            sql_config.get_config_by_prefix("dask.sort")
            # Returns {"dask.sort.persist": True}

            sql_config.get_config_by_prefix("missing.key")
            sql_config.get_config_by_prefix(None)
            # Both return {}
        """
        # Linear scan over all keys; acceptable for small config dictionaries.
        return (
            {
                key: val
                for key, val in self.config_dict.items()
                if key.startswith(config_prefix)
            }
            if config_prefix
            else {}
        )
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -231,6 +231,15 @@ def _do_aggregations( | |||||||||
| for col in group_columns: | ||||||||||
| collected_aggregations[None].append((col, col, "first")) | ||||||||||
|
|
||||||||||
| groupby_agg_options = context.schema[ | ||||||||||
| context.schema_name | ||||||||||
|
Comment on lines
+234
to
+235
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was giving this a bit more thought and it seems like dask-sql/dask_sql/physical/rel/logical/aggregate.py Lines 285 to 287 in 5b8f8a9
Perhaps @rajagurunath or someone more familiar with this part of the codebase can provide insight here. |
||||||||||
| ].config.get_config_by_prefix("dask.groupby.aggregate") | ||||||||||
| # Update the config string to only include the actual param value | ||||||||||
| # i.e. dask.groupby.aggregate.split_out -> split_out | ||||||||||
charlesbluca marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
| for config_key in list(groupby_agg_options.keys()): | ||||||||||
| groupby_agg_options[ | ||||||||||
| config_key.rpartition(".")[2] | ||||||||||
| ] = groupby_agg_options.pop(config_key) | ||||||||||
|
Comment on lines
+239
to
+242
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was initially planning on having this logic in
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking about this more, would it make sense to consolidate this functionality in
Suggested change
|
||||||||||
| # Now we can go ahead and use these grouped aggregations | ||||||||||
| # to perform the actual aggregation | ||||||||||
| # It is very important to start with the non-filtered entry. | ||||||||||
|
|
@@ -240,13 +249,23 @@ def _do_aggregations( | |||||||||
| if key in collected_aggregations: | ||||||||||
| aggregations = collected_aggregations.pop(key) | ||||||||||
| df_result = self._perform_aggregation( | ||||||||||
| df, None, aggregations, additional_column_name, group_columns, | ||||||||||
| df, | ||||||||||
| None, | ||||||||||
| aggregations, | ||||||||||
| additional_column_name, | ||||||||||
| group_columns, | ||||||||||
| groupby_agg_options, | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| # Now we can also do the rest | ||||||||||
| for filter_column, aggregations in collected_aggregations.items(): | ||||||||||
| agg_result = self._perform_aggregation( | ||||||||||
| df, filter_column, aggregations, additional_column_name, group_columns, | ||||||||||
| df, | ||||||||||
| filter_column, | ||||||||||
| aggregations, | ||||||||||
| additional_column_name, | ||||||||||
| group_columns, | ||||||||||
| groupby_agg_options, | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| # ... and finally concat the new data with the already present columns | ||||||||||
|
|
@@ -358,6 +377,7 @@ def _perform_aggregation( | |||||||||
| aggregations: List[Tuple[str, str, Any]], | ||||||||||
| additional_column_name: str, | ||||||||||
| group_columns: List[str], | ||||||||||
| groupby_agg_options: Dict[str, Any] = {}, | ||||||||||
| ): | ||||||||||
| tmp_df = df | ||||||||||
|
|
||||||||||
|
|
@@ -382,7 +402,7 @@ def _perform_aggregation( | |||||||||
|
|
||||||||||
| # Now apply the aggregation | ||||||||||
| logger.debug(f"Performing aggregation {dict(aggregations_dict)}") | ||||||||||
| agg_result = grouped_df.agg(aggregations_dict) | ||||||||||
| agg_result = grouped_df.agg(aggregations_dict, **groupby_agg_options) | ||||||||||
|
|
||||||||||
| # ... fix the column names to a single level ... | ||||||||||
| agg_result.columns = agg_result.columns.get_level_values(-1) | ||||||||||
|
|
||||||||||
Uh oh!
There was an error while loading. Please reload this page.