Merged

Commits (90)
b1900cf
Condition for BinaryExpr, filter, input_ref, rexcall, and rexliteral
jdye64 Mar 26, 2022
1e48597
Updates for test_filter
jdye64 Mar 31, 2022
fd41a8c
more of test_filter.py working with the exception of some date pytests
jdye64 Mar 31, 2022
682c009
Add workflow to keep datafusion dev branch up to date (#440)
charlesbluca Mar 25, 2022
ab69dd8
Include setuptools-rust in conda build recipe, in host and run
jdye64 Apr 13, 2022
ce4c31e
Remove PyArrow dependency
jdye64 Apr 20, 2022
8785b8c
rebase with datafusion-sql-planner
jdye64 Apr 21, 2022
3e45ab8
refactor changes that were inadvertent during rebase
jdye64 Apr 21, 2022
1734b89
timestamp with local time zone
jdye64 Apr 21, 2022
ac7d9f6
Bump DataFusion version (#494)
andygrove Apr 21, 2022
cbf5db0
Include RelDataType work
jdye64 Apr 21, 2022
d9380a6
Include RelDataType work
jdye64 Apr 21, 2022
ad56fc2
Introduced SqlTypeName Enum in Rust and mappings for Python
jdye64 Apr 22, 2022
7b20e66
impl PyExpr.getIndex()
jdye64 Apr 22, 2022
7dd2017
add getRowType() for logical.rs
jdye64 Apr 22, 2022
984f523
Introduce DaskTypeMap for storing correlating SqlTypeName and DataTypes
jdye64 Apr 23, 2022
1405fea
use str values instead of Rust Enums, Python is unable to Hash the Ru…
jdye64 Apr 23, 2022
789aaad
linter changes, why did that work on my local pre-commit??
jdye64 Apr 23, 2022
652205e
linter changes, why did that work on my local pre-commit??
jdye64 Apr 23, 2022
5127f87
Convert final strs to SqlTypeName Enum
jdye64 Apr 24, 2022
cf568dc
removed a few print statements
jdye64 Apr 24, 2022
4fb640e
commit to share with colleague
jdye64 Apr 24, 2022
32127e5
updates
jdye64 Apr 25, 2022
f5e24fe
checkpoint
jdye64 Apr 25, 2022
11cf212
Temporarily disable conda run_test.py script since it uses features n…
jdye64 Apr 25, 2022
46dfb0a
formatting after upstream merge
jdye64 Apr 25, 2022
fa71674
expose fromString method for SqlTypeName to use Enums instead of stri…
jdye64 Apr 25, 2022
f6e86ca
expanded SqlTypeName from_string() support
jdye64 Apr 25, 2022
3d1a5ad
accept INT as INTEGER
jdye64 Apr 25, 2022
384e446
tests update
jdye64 Apr 25, 2022
199b9d2
checkpoint
jdye64 Apr 25, 2022
c9dffae
checkpoint
jdye64 Apr 27, 2022
c9aad43
Refactor PyExpr by removing From trait, and using recursion to expand…
jdye64 Apr 28, 2022
11100fa
skip test that uses create statement for gpuci
jdye64 Apr 28, 2022
643e85d
Basic DataFusion Select Functionality (#489)
jdye64 Apr 28, 2022
b36ef16
updates for expression
jdye64 Apr 28, 2022
5c94fbc
uncommented pytests
jdye64 Apr 28, 2022
bb461c8
uncommented pytests
jdye64 Apr 28, 2022
f65b1ab
code cleanup for review
jdye64 Apr 28, 2022
dc7553f
code cleanup for review
jdye64 Apr 28, 2022
f1dc0b2
Enabled more pytest that work now
jdye64 Apr 28, 2022
940e867
Enabled more pytest that work now
jdye64 Apr 28, 2022
6769ca0
Output Expression as String when BinaryExpr does not contain a named …
jdye64 Apr 29, 2022
c4ed9bd
Output Expression as String when BinaryExpr does not contain a named …
jdye64 Apr 29, 2022
05c5788
Disable 2 pytest that are causing gpuCI issues. They will be address …
jdye64 Apr 29, 2022
a33aa63
Handle Between operation for case-when
jdye64 Apr 29, 2022
20efd5c
adjust timestamp casting
jdye64 May 2, 2022
281baf7
merge with upstream
jdye64 May 6, 2022
d666bdd
merge with upstream/datafusion-sql-planner
jdye64 May 9, 2022
533f50a
Refactor projection _column_name() logic to the _column_name logic in…
jdye64 May 9, 2022
a42a133
removed println! statements
jdye64 May 9, 2022
dc12f5d
introduce join getCondition() logic for retrieving the combining Rex …
jdye64 May 10, 2022
9dce68a
merge with upstream
jdye64 May 10, 2022
10cd463
merge with upstream
jdye64 May 10, 2022
a1841c3
Updates from review
jdye64 May 11, 2022
3001943
Add Offset and point to repo with offset in datafusion
jdye64 May 11, 2022
7ec66da
Introduce offset
jdye64 May 12, 2022
b72917b
limit updates
jdye64 May 12, 2022
651c9ab
commit before upstream merge
jdye64 May 15, 2022
4e69813
merged with upstream/datafusion-sql-planner
jdye64 May 16, 2022
3219ad0
Code formatting
jdye64 May 16, 2022
5a88155
Merge with upstream
jdye64 May 16, 2022
23adefa
Merge with upstream
jdye64 May 16, 2022
bd94ccf
Merge remote-tracking branch 'upstream/datafusion-sql-planner' into d…
jdye64 May 17, 2022
bf91e8f
update Cargo.toml to use Arrow-DataFusion version with LIMIT logic
jdye64 May 17, 2022
3dc6a89
Bump DataFusion version to get changes around variant_name()
jdye64 May 18, 2022
08b38aa
Use map partitions for determining the offset
jdye64 May 19, 2022
7b52f41
Merge with upstream datafusion-crossjoin merge
jdye64 May 19, 2022
6638930
Added multiple LogicalPlan inputs for join conditions
jdye64 May 20, 2022
e24b97f
Merge with upstream LIMIT PR
jdye64 May 20, 2022
61bd864
Merge remote-tracking branch 'upstream/datafusion-sql-planner' into d…
jdye64 May 22, 2022
e3b0d2b
Merge with upstream
jdye64 May 23, 2022
0407c6f
Rename underlying DataContainer's DataFrame instance to match the col…
jdye64 May 23, 2022
af1c138
Adjust ColumnContainer mapping after join.py logic to entire the bake…
jdye64 May 23, 2022
8853765
Add enumerate to column_{i} generation string to ensure columns exist…
jdye64 May 24, 2022
2adc5ce
Adjust join schema logic to perform merge instead of join on rust sid…
jdye64 May 24, 2022
6005018
Handle DataFusion COUNT(UInt8(1)) as COUNT(*)
jdye64 May 24, 2022
f640e1d
commit before merge
jdye64 May 24, 2022
f0cc07b
merge with upstream datafusion-sql-planner
jdye64 May 24, 2022
3159645
Update function for gathering index of a expression
jdye64 May 24, 2022
ba8cec2
Update for review check
jdye64 May 25, 2022
a8fba46
Adjust RelDataType to retrieve fully qualified column names
jdye64 May 26, 2022
8a1a865
Adjust base.py to get fully qualified column name
jdye64 May 26, 2022
6e966b6
Enable passing pytests in test_join.py
jdye64 May 26, 2022
b9604cc
Adjust keys provided by getting backend column mapping name
jdye64 May 27, 2022
014fe68
Adjust output_col to not use the backend_column name for special rese…
jdye64 May 27, 2022
5b0dba3
uncomment cross join pytest which works now
jdye64 May 27, 2022
d17d859
Uncomment passing pytests in test_select.py
jdye64 May 27, 2022
805ec8a
Review updates
jdye64 May 28, 2022
7728bd4
Add back complex join case condition, not just cross join but 'comple…
jdye64 May 28, 2022
2 changes: 0 additions & 2 deletions continuous_integration/environment-3.9-dev.yaml
@@ -2,8 +2,6 @@ name: dask-sql
channels:
- conda-forge
- nodefaults
- rapidsai-nightly
- nvidia
dependencies:
- adagio>=0.2.3
- antlr4-python3-runtime>=4.9.2, <4.10.0 # Remove max pin after qpd(fugue dependency) updates their conda recipe
67 changes: 54 additions & 13 deletions dask_planner/src/expression.rs
@@ -17,15 +17,16 @@ use datafusion::prelude::Column;

use crate::sql::exceptions::py_runtime_err;
use datafusion::common::DFField;
use datafusion::logical_plan::exprlist_to_fields;
use datafusion::logical_plan::{exprlist_to_fields, DFSchema};
use std::sync::Arc;

/// A PyExpr that can be used on a DataFrame
#[pyclass(name = "Expression", module = "datafusion", subclass)]
#[derive(Debug, Clone)]
pub struct PyExpr {
pub input_plan: Option<Arc<LogicalPlan>>,
pub expr: Expr,
// Why a Vec here? Because BinaryExpr on Join might have multiple LogicalPlans
pub input_plan: Option<Vec<Arc<LogicalPlan>>>,
}
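
A join predicate is the motivating case for the Vec above: its BinaryExpr compares columns that live in two different input plans, so a single Arc<LogicalPlan> cannot resolve both sides. A minimal sketch, assuming a DataFusion Join node named join is in scope (hypothetical names, not part of the diff):

// Hypothetical illustration: a predicate like l.id = r.id pulls one column
// from the join's left input and one from its right, so the PyExpr must
// carry both plans for later name and index resolution.
let condition = Expr::BinaryExpr {
    left: Box::new(Expr::Column(Column::from_qualified_name("l.id"))),
    op: Operator::Eq,
    right: Box::new(Expr::Column(Column::from_qualified_name("r.id"))),
};
let py_expr = PyExpr::from(
    condition,
    Some(vec![join.left.clone(), join.right.clone()]),
);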

impl From<PyExpr> for Expr {
@@ -57,7 +58,7 @@ impl PyExpr {
/// However, in this case Expr does not contain the contextual
/// `LogicalPlan` instance that we need, so we need to make an instance
/// function to take it and create the PyExpr.
pub fn from(expr: Expr, input: Option<Arc<LogicalPlan>>) -> PyExpr {
pub fn from(expr: Expr, input: Option<Vec<Arc<LogicalPlan>>>) -> PyExpr {
PyExpr {
input_plan: input,
expr: expr,
@@ -67,7 +68,7 @@
/// Determines the name of the `Expr` instance by examining the LogicalPlan
pub fn _column_name(&self, plan: &LogicalPlan) -> Result<String> {
let field = expr_to_field(&self.expr, &plan)?;
Ok(field.unqualified_column().name.clone())
Ok(field.qualified_column().flat_name().clone())
}

fn _rex_type(&self, expr: &Expr) -> RexType {
@@ -123,16 +124,56 @@
/// Gets the positional index of the Expr instance from the LogicalPlan DFSchema
#[pyo3(name = "getIndex")]
pub fn index(&self) -> PyResult<usize> {
let input: &Option<Arc<LogicalPlan>> = &self.input_plan;
let input: &Option<Vec<Arc<LogicalPlan>>> = &self.input_plan;
match input {
Some(plan) => {
let name: Result<String> = self.expr.name(plan.schema());
match name {
Ok(fq_name) => Ok(plan
.schema()
.index_of_column(&Column::from_qualified_name(&fq_name))
.unwrap()),
Err(e) => panic!("{:?}", e),
Some(input_plans) => {
if input_plans.len() == 1 {
let name: Result<String> = self.expr.name(input_plans[0].schema());
match name {
Ok(fq_name) => Ok(input_plans[0]
.schema()
.index_of_column(&Column::from_qualified_name(&fq_name))
.unwrap()),
Err(e) => panic!("{:?}", e),
}
} else if input_plans.len() >= 2 {
let mut base_schema: DFSchema = (**input_plans[0].schema()).clone();
for input_idx in 1..input_plans.len() {
let input_schema: DFSchema = (**input_plans[input_idx].schema()).clone();
base_schema.merge(&input_schema);
}
let name: Result<String> = self.expr.name(&base_schema);
match name {
Ok(fq_name) => {
let idx: Result<usize> =
base_schema.index_of_column(&Column::from_qualified_name(&fq_name));
match idx {
Ok(index) => Ok(index),
Err(e) => {
// This logic is encountered when a non-qualified column name is
// provided AND there exists more than one entry with that
// unqualified name. This logic attempts to narrow down to the
// single qualified column name.
let qualified_fields: Vec<&DFField> =
base_schema.fields_with_unqualified_name(&fq_name);
for qf in &qualified_fields {
if qf.name().eq(&fq_name) {
let qualifier: String = qf.qualifier().unwrap().clone();
let qual: Option<&str> = Some(&qualifier);
let index: usize = base_schema
.index_of_column_by_name(qual, &qf.name())
.unwrap();
return Ok(index);
}
}
panic!("Unable to find match for column with name: '{}' in DFSchema", &fq_name);
}
}
}
Err(e) => panic!("{:?}", e),
}
} else {
panic!("Encountered a PyExpr with an empty Vec of input LogicalPlans; unable to determine an index");
}
}
None => {
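
The multi-input branch above amounts to merging every input schema and resolving the fully qualified column name against the combined schema. A condensed sketch of that lookup, written as a hypothetical free function over the same DataFusion APIs the diff already uses (it omits the unqualified-name fallback handled above):

// Sketch: resolve an expression's positional index across several inputs by
// merging their schemas, mirroring the happy path of PyExpr::index() above.
fn index_across_inputs(expr: &Expr, inputs: &[Arc<LogicalPlan>]) -> Result<usize> {
    let mut schema: DFSchema = (**inputs[0].schema()).clone();
    for plan in &inputs[1..] {
        schema.merge(plan.schema());
    }
    let name: String = expr.name(&schema)?;
    schema.index_of_column(&Column::from_qualified_name(&name))
}
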
37 changes: 27 additions & 10 deletions dask_planner/src/sql/logical.rs
@@ -14,7 +14,8 @@ mod sort;

use datafusion::logical_expr::LogicalPlan;

use datafusion::common::Result;
use datafusion::common::{DataFusionError, Result};
use datafusion::logical_plan::DFSchemaRef;
use datafusion::prelude::Column;

use crate::sql::exceptions::py_type_err;
@@ -116,15 +117,6 @@ impl PyLogicalPlan {
Ok(py_inputs)
}

/// Examines the current_node and get the fields associated with it
pub fn get_field_names(&mut self) -> PyResult<Vec<String>> {
let mut field_names: Vec<String> = Vec::new();
for field in self.current_node().schema().fields() {
field_names.push(String::from(field.name()));
}
Ok(field_names)
}

/// If the LogicalPlan represents access to a Table that instance is returned
/// otherwise None is returned
#[pyo3(name = "getTable")]
@@ -137,6 +129,31 @@
}
}

#[pyo3(name = "getCurrentNodeSchemaName")]
pub fn get_current_node_schema_name(&self) -> PyResult<&str> {
match &self.current_node {
Some(e) => {
let _sch: &DFSchemaRef = e.schema();
//TODO: Where can I actually get this in the context of the running query?
Ok("root")
}
None => Err(py_type_err(DataFusionError::Plan(
"Unable to determine the current node schema name".to_string(),
))),
}
}

#[pyo3(name = "getCurrentNodeTableName")]
pub fn get_current_node_table_name(&mut self) -> PyResult<String> {
match self.table() {
Ok(dask_table) => Ok(dask_table.name.clone()),
Err(_e) => Err(PyErr::new::<pyo3::exceptions::PyTypeError, _>(
"Unable to determine current node table name",
)),
}
}

/// Gets the Relation "type" of the current node. Ex: Projection, TableScan, etc
pub fn get_current_node_type(&mut self) -> PyResult<&str> {
Ok(match self.current_node() {
6 changes: 3 additions & 3 deletions dask_planner/src/sql/logical/aggregate.rs
@@ -21,7 +21,7 @@ impl PyAggregate {
for expr in &self.aggregate.group_expr {
group_exprs.push(PyExpr::from(
expr.clone(),
Some(self.aggregate.input.clone()),
Some(vec![self.aggregate.input.clone()]),
));
}
Ok(group_exprs)
@@ -33,7 +33,7 @@
for expr in &self.aggregate.aggr_expr {
agg_exprs.push(PyExpr::from(
expr.clone(),
Some(self.aggregate.input.clone()),
Some(vec![self.aggregate.input.clone()]),
));
}
Ok(agg_exprs)
@@ -54,7 +54,7 @@
let mut exprs: Vec<PyExpr> = Vec::new();
for expr in args {
exprs.push(PyExpr {
input_plan: Some(self.aggregate.input.clone()),
input_plan: Some(vec![self.aggregate.input.clone()]),
expr: expr,
});
}
2 changes: 1 addition & 1 deletion dask_planner/src/sql/logical/filter.rs
@@ -19,7 +19,7 @@
pub fn get_condition(&mut self) -> PyResult<PyExpr> {
Ok(PyExpr::from(
self.filter.predicate.clone(),
Some(self.filter.input.clone()),
Some(vec![self.filter.input.clone()]),
))
}
}
40 changes: 39 additions & 1 deletion dask_planner/src/sql/logical/join.rs
@@ -1,7 +1,11 @@
use crate::expression::PyExpr;
use crate::sql::column;

use datafusion::physical_plan::expressions::Column;

use datafusion::logical_expr::logical_plan::Join;
use datafusion::logical_plan::{JoinType, LogicalPlan};
use datafusion::logical_plan::{JoinType, LogicalPlan, Operator};
use datafusion::prelude::{col, Expr};

use crate::sql::exceptions::py_type_err;
use pyo3::prelude::*;
@@ -14,6 +18,40 @@ pub struct PyJoin {

#[pymethods]
impl PyJoin {
#[pyo3(name = "getCondition")]
pub fn join_condition(&self) -> PyExpr {
// TODO: This logic should be altered once https://github.com/apache/arrow-datafusion/issues/2496 is complete
if self.join.on.len() >= 1 {
let (left_col, right_col) = &self.join.on[0];
let mut root_expr: Expr = Expr::BinaryExpr {
left: Box::new(Expr::Column(left_col.clone())),
op: Operator::Eq,
right: Box::new(Expr::Column(right_col.clone())),
};
for idx in 1..self.join.on.len() {
let (left_col, right_col) = &self.join.on[idx];
let ex: Expr = Expr::BinaryExpr {
left: Box::new(Expr::Column(left_col.clone())),
op: Operator::Eq,
right: Box::new(Expr::Column(right_col.clone())),
};

// Combine successive equijoin predicates with a logical AND
root_expr = Expr::BinaryExpr {
left: Box::new(root_expr),
op: Operator::And,
right: Box::new(ex),
}
}
PyExpr::from(
root_expr,
Some(vec![self.join.left.clone(), self.join.right.clone()]),
)
} else {
panic!("Encountered a Join without any equijoin column pairs (on.len() == {}). Non-equijoin conditions are not currently supported
until DataFusion makes some changes to allow for joining logic other than just Equijoin.", self.join.on.len())
}
}
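
The loop above folds the equijoin pairs into a single conjunctive predicate. The same fold as a standalone sketch, assuming DataFusion's logical Column type (datafusion::prelude::Column) for the pairs (hypothetical helper, not part of the PR):

// Sketch: fold equijoin column pairs into one (l = r) AND (l = r) ... predicate.
fn fold_equijoin(on: &[(Column, Column)]) -> Option<Expr> {
    on.iter()
        .map(|(l, r)| Expr::BinaryExpr {
            left: Box::new(Expr::Column(l.clone())),
            op: Operator::Eq,
            right: Box::new(Expr::Column(r.clone())),
        })
        .reduce(|acc, e| Expr::BinaryExpr {
            left: Box::new(acc),
            op: Operator::And,
            right: Box::new(e),
        })
}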

#[pyo3(name = "getJoinConditions")]
pub fn join_conditions(&mut self) -> PyResult<Vec<(column::PyColumn, column::PyColumn)>> {
let lhs_table_name: String = match &*self.join.left {
2 changes: 1 addition & 1 deletion dask_planner/src/sql/logical/limit.rs
@@ -18,7 +18,7 @@ impl PyLimit {
pub fn limit_n(&self) -> PyResult<PyExpr> {
Ok(PyExpr::from(
Expr::Literal(ScalarValue::UInt64(Some(self.limit.n.try_into().unwrap()))),
Some(self.limit.input.clone()),
Some(vec![self.limit.input.clone()]),
))
}
}
4 changes: 2 additions & 2 deletions dask_planner/src/sql/logical/offset.rs
@@ -18,7 +18,7 @@ impl PyOffset {
pub fn offset(&self) -> PyResult<PyExpr> {
Ok(PyExpr::from(
Expr::Literal(ScalarValue::UInt64(Some(self.offset.offset as u64))),
Some(self.offset.input.clone()),
Some(vec![self.offset.input.clone()]),
))
}

@@ -27,7 +27,7 @@ impl PyOffset {
// TODO: Still need to implement fetch size! For now get everything from offset on with '0'
Ok(PyExpr::from(
Expr::Literal(ScalarValue::UInt64(Some(0))),
Some(self.offset.input.clone()),
Some(vec![self.offset.input.clone()]),
))
}
}
6 changes: 3 additions & 3 deletions dask_planner/src/sql/logical/projection.rs
@@ -19,7 +19,7 @@ impl PyProjection {
match &local_expr.expr {
Expr::Alias(expr, _name) => {
let py_expr: PyExpr =
PyExpr::from(*expr.clone(), Some(self.projection.input.clone()));
PyExpr::from(*expr.clone(), Some(vec![self.projection.input.clone()]));
projs.extend_from_slice(self.projected_expressions(&py_expr).as_slice());
}
_ => projs.push(local_expr.clone()),
@@ -34,8 +34,8 @@
fn named_projects(&mut self) -> PyResult<Vec<(String, PyExpr)>> {
let mut named: Vec<(String, PyExpr)> = Vec::new();
for expression in self.projection.expr.clone() {
let mut py_expr: PyExpr = PyExpr::from(expression, Some(self.projection.input.clone()));
py_expr.input_plan = Some(self.projection.input.clone());
let mut py_expr: PyExpr =
PyExpr::from(expression, Some(vec![self.projection.input.clone()]));
for expr in self.projected_expressions(&py_expr) {
if let Ok(name) = expr._column_name(&*self.projection.input) {
named.push((name, expr.clone()));
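
projected_expressions above recursively unwraps Alias nodes so that named_projects can pair each output name with its underlying expression. The core of that unwrapping as a condensed sketch (hypothetical helper, assuming DataFusion's Expr::Alias(Box<Expr>, String) variant):

// Sketch: recursively strip Alias wrappers from an expression.
fn unalias(expr: Expr) -> Expr {
    match expr {
        Expr::Alias(inner, _name) => unalias(*inner),
        other => other,
    }
}
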
5 changes: 4 additions & 1 deletion dask_planner/src/sql/logical/sort.rs
@@ -39,7 +39,10 @@ impl PySort {
pub fn sort_expressions(&self) -> PyResult<Vec<PyExpr>> {
let mut sort_exprs: Vec<PyExpr> = Vec::new();
for expr in &self.sort.expr {
sort_exprs.push(PyExpr::from(expr.clone(), Some(self.sort.input.clone())));
sort_exprs.push(PyExpr::from(
expr.clone(),
Some(vec![self.sort.input.clone()]),
));
}
Ok(sort_exprs)
}
6 changes: 5 additions & 1 deletion dask_planner/src/sql/table.rs
@@ -177,6 +177,10 @@ pub(crate) fn table_from_logical_plan(plan: &LogicalPlan) -> Option<DaskTable> {
table_from_logical_plan(&join.left)
}
LogicalPlan::Aggregate(agg) => table_from_logical_plan(&agg.input),
_ => todo!("table_from_logical_plan: unimplemented LogicalPlan type encountered"),
LogicalPlan::SubqueryAlias(alias) => table_from_logical_plan(&alias.input),
_ => todo!(
"table_from_logical_plan: unimplemented LogicalPlan type {:?} encountered",
plan
),
}
}
4 changes: 2 additions & 2 deletions dask_planner/src/sql/types/rel_data_type.rs
@@ -8,7 +8,7 @@ const PRECISION_NOT_SPECIFIED: i32 = i32::MIN;
const SCALE_NOT_SPECIFIED: i32 = -1;

/// RelDataType represents the type of a scalar expression or entire row returned from a relational expression.
#[pyclass]
#[pyclass(name = "RelDataType", module = "dask_planner", subclass)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct RelDataType {
nullable: bool,
@@ -83,7 +83,7 @@ impl RelDataType {
assert!(!self.field_list.is_empty());
let mut field_names: Vec<String> = Vec::new();
for field in &self.field_list {
field_names.push(String::from(field.name()));
field_names.push(String::from(field.qualified_name()));
}
field_names
}
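
The practical effect of the change above is that RelDataType now reports qualifier-prefixed field names. A small illustration using DataFusion's DFField as a stand-in, since it exposes the same qualified/unqualified split (hypothetical values):

// Sketch: qualified vs. unqualified field names on a DataFusion DFField.
use datafusion::arrow::datatypes::DataType;
use datafusion::common::DFField;

fn qualified_name_demo() {
    let field = DFField::new(Some("df"), "id", DataType::Int64, false);
    assert_eq!(field.name(), "id");              // unqualified name
    assert_eq!(field.qualified_name(), "df.id"); // qualifier-prefixed name
}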