dask-contrib · charlesbluca · Mar 23, 2022 · Mar 22, 2022 · Mar 22, 2022 · Mar 22, 2022
@@ -7,13 +7,9 @@ services:
         command: dask-scheduler
         ports:
             - "8786:8786"
-        environment:
-            EXTRA_CONDA_PACKAGES: "pandas>=1.0.0"
     dask-worker:
         container_name: dask-worker
         image: daskdev/dask:latest
         command: dask-worker dask-scheduler:8786
-        environment:
-            EXTRA_CONDA_PACKAGES: "pandas>=1.0.0"
         volumes:
             - /tmp:/tmp
@@ -180,9 +180,7 @@ jobs:
           docker logs dask-worker
       - name: Test with pytest while running an independent dask cluster
         run: |
-          pytest tests
-        env:
-          DASK_SQL_TEST_SCHEDULER: tcp://127.0.0.1:8786
+          DASK_SQL_TEST_SCHEDULER="tcp://127.0.0.1:8786" pytest tests
 
   import:
     name: "Test importing with bare requirements"

@@ -6,8 +6,7 @@
 import pandas as pd
 import pytest
 from dask.datasets import timeseries
-from dask.distributed import Client, LocalCluster
-from dask.distributed.utils_test import loop  # noqa: F401
+from dask.distributed import Client
 from pandas.testing import assert_frame_equal
 
 try:
@@ -287,40 +286,27 @@ def gpu_cluster():
         pytest.skip("dask_cuda not installed")
         return None
 
-    cluster = LocalCUDACluster(protocol="tcp")
-    yield cluster
-    cluster.close()
+    with LocalCUDACluster(protocol="tcp") as cluster:
+        yield cluster
 
 
 @pytest.fixture()
 def gpu_client(gpu_cluster):
     if gpu_cluster:
-        client = Client(gpu_cluster)
-        yield client
-        client.close()
+        with Client(gpu_cluster) as client:
+            yield client
 
 
 @pytest.fixture(scope="session", autouse=True)
-def setup_dask_client():
-    """Setup a dask client if requested"""
-    address = os.getenv("DASK_SQL_TEST_SCHEDULER", None)
-    if address:
-        client = Client(address)
+def client():
+    """Session-wide dask client; uses DASK_SQL_TEST_SCHEDULER as address if set."""
+    # Enter the client as a context manager so it is closed at session
+    # teardown, the same way gpu_client releases its client.
+    with Client(address=os.getenv("DASK_SQL_TEST_SCHEDULER", None)) as client:
+        yield client
 
 
 skip_if_external_scheduler = pytest.mark.skipif(
     os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None,
     reason="Can not run with external cluster",
 )
-
-
-@pytest.fixture()
-def cluster(loop):  # noqa: F811
-    with LocalCluster(loop=loop) as cluster:
-        yield cluster
-
-
-@pytest.fixture()
-def client(cluster):
-    with Client(cluster) as client:
-        yield client
@@ -100,9 +100,7 @@ def test_meta_commands(c, client, capsys):
     assert "Schema not_exists not available\n" == captured.out
 
     with pytest.raises(
-        OSError,
-        match="Timed out during handshake while "
-        "connecting to tcp://localhost:8787 after 5 s",
+        OSError, match="Timed out .* to tcp://localhost:8787 after 5 s",
     ):
         with dask_config.set({"distributed.comm.timeouts.connect": 5}):
             client = _meta_commands("\\dsc localhost:8787", context=c, client=client)
@@ -120,8 +118,9 @@ def test_connection_info(c, client, capsys):
 
 
 def test_quit(c, client, capsys):
+    dummy_client = MagicMock()
     with patch("sys.exit", return_value=lambda: "exit"):
-        _meta_commands("quit", context=c, client=client)
+        _meta_commands("quit", context=c, client=dummy_client)
         captured = capsys.readouterr()
         assert captured.out == "Quitting dask-sql ...\n"
 

@@ -63,6 +63,8 @@ def test_cluster_memory(client, c, df, gpu):
 
     assert_frame_equal(df, return_df)
 
+    client.unpublish_dataset("df")
+
 
 @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)])
 def test_create_from_csv_persist(c, df, temporary_data_file, gpu):

@@ -37,8 +37,6 @@ def app_client(c):
 
     yield TestClient(app)
 
-    app.client.close()
-
 
 def test_jdbc_has_schema(app_client, c):
     create_meta_data(c)

@@ -63,6 +63,8 @@ def gpu_training_df(c):
     return None
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_training_and_prediction(c, training_df):
     c.sql(
         """
@@ -149,6 +151,8 @@ def test_xgboost_training_prediction(c, gpu_training_df):
     check_trained_model(c)
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_clustering_and_prediction(c, training_df):
     c.sql(
         """
@@ -165,6 +169,8 @@ def test_clustering_and_prediction(c, training_df):
     check_trained_model(c)
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_iterative_and_prediction(c, training_df):
     c.sql(
         """
@@ -184,6 +190,8 @@ def test_iterative_and_prediction(c, training_df):
     check_trained_model(c)
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_show_models(c, training_df):
     c.sql(
         """
@@ -403,6 +411,8 @@ def test_drop_model(c, training_df):
     assert "my_model" not in c.schema[c.schema_name].models
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_describe_model(c, training_df):
     c.sql(
         """
@@ -504,6 +514,8 @@ def test_export_model(c, training_df, tmpdir):
         )
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_mlflow_export(c, training_df, tmpdir):
     # Test only when mlflow was installed
     mlflow = pytest.importorskip("mlflow", reason="mlflow not installed")
@@ -560,10 +572,12 @@ def test_mlflow_export(c, training_df, tmpdir):
         )
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
 @pytest.mark.xfail(
     sys.platform == "win32",
     reason="Windows is not officially supported for dask/xgboost",
 )
+@skip_if_external_scheduler
 def test_mlflow_export_xgboost(c, client, training_df, tmpdir):
     # Test only when mlflow & xgboost was installed
     mlflow = pytest.importorskip("mlflow", reason="mlflow not installed")
@@ -626,6 +640,8 @@ def test_mlflow_export_lightgbm(c, training_df, tmpdir):
     )
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_ml_experiment(c, client, training_df):
 
     with pytest.raises(
@@ -818,6 +834,8 @@ def test_ml_experiment(c, client, training_df):
         )
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_experiment_automl_classifier(c, client, training_df):
     tpot = pytest.importorskip("tpot", reason="tpot not installed")
     # currently tested with tpot==
@@ -841,6 +859,8 @@ def test_experiment_automl_classifier(c, client, training_df):
     check_trained_model(c, "my_automl_exp1")
 
 
+# TODO - many ML tests fail on clusters without sklearn - can we avoid this?
+@skip_if_external_scheduler
 def test_experiment_automl_regressor(c, client, training_df):
     tpot = pytest.importorskip("tpot", reason="tpot not installed")
     # test regressor