dask-contrib · charlesbluca · Mar 24, 2022 · Feb 23, 2022 · Mar 4, 2022 · Mar 4, 2022
@@ -119,9 +119,10 @@ def merge_single_partitions(lhs_partition, rhs_partition):
                 # which is definitely not possible (java dependency, JVM start...)
                 lhs_partition = lhs_partition.assign(common=1)
                 rhs_partition = rhs_partition.assign(common=1)
-                merged_data = lhs_partition.merge(rhs_partition, on=["common"])
 
-                return merged_data
+                return lhs_partition.merge(rhs_partition, on="common").drop(
+                    columns="common"
+                )
 
             # Iterate nested over all partitions from lhs and rhs and merge them
             name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
@@ -140,11 +141,7 @@ def merge_single_partitions(lhs_partition, rhs_partition):
             )
 
             meta = dd.dispatch.concat(
-                [
-                    df_lhs_renamed._meta_nonempty.assign(common=1),
-                    df_rhs_renamed._meta_nonempty,
-                ],
-                axis=1,
+                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1
             )
             # TODO: Do we know the divisions in any way here?
             divisions = [None] * (len(dsk) + 1)

@@ -1,3 +1,4 @@
+import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 from dask.dataframe.utils import assert_eq
@@ -266,3 +267,26 @@ def test_join_case_projection_subquery():
     ) sum_am_pm
     """
     ).compute()
+
+
+def test_conditional_join_with_limit(c):
+    """Filtered self cross join with LIMIT must match the pandas equivalent."""
+    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
+    ddf = dd.from_pandas(df, 5)  # 5 = requested npartitions
+    c.create_table("many_partitions", ddf)
+    # Emulate the cross join in pandas: merge on a constant key, then drop it.
+    df = df.assign(common=1)
+    expected_df = df.merge(df, on="common", suffixes=("", "0")).drop(columns="common")
+    expected_df = expected_df[expected_df["a"] >= 2][:4]
+    # NOTE(review): LIMIT has no ORDER BY; expected assumes pandas row order.
+    actual_df = c.sql(
+        """
+    SELECT * FROM
+        many_partitions as df1, many_partitions as df2
+    WHERE
+        df1."a" >= 2
+    LIMIT 4
+    """
+    )
+    # Compare values only; the index is ignored via check_index=False.
+    dd.assert_eq(actual_df, expected_df, check_index=False)