diff --git a/.github/docker-compose.yaml b/.github/docker-compose.yaml index 6edf29e02..cfb7eb43f 100644 --- a/.github/docker-compose.yaml +++ b/.github/docker-compose.yaml @@ -7,13 +7,9 @@ services: command: dask-scheduler ports: - "8786:8786" - environment: - EXTRA_CONDA_PACKAGES: "pandas>=1.0.0" dask-worker: container_name: dask-worker image: daskdev/dask:latest command: dask-worker dask-scheduler:8786 - environment: - EXTRA_CONDA_PACKAGES: "pandas>=1.0.0" volumes: - /tmp:/tmp diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bd253c5d2..b590260f7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -180,9 +180,7 @@ jobs: docker logs dask-worker - name: Test with pytest while running an independent dask cluster run: | - pytest tests - env: - DASK_SQL_TEST_SCHEDULER: tcp://127.0.0.1:8786 + DASK_SQL_TEST_SCHEDULER="tcp://127.0.0.1:8786" pytest tests import: name: "Test importing with bare requirements" diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index ed4aa13e1..454c2ce5c 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -6,8 +6,7 @@ import pandas as pd import pytest from dask.datasets import timeseries -from dask.distributed import Client, LocalCluster -from dask.distributed.utils_test import loop # noqa: F401 +from dask.distributed import Client from pandas.testing import assert_frame_equal try: @@ -287,40 +286,23 @@ def gpu_cluster(): pytest.skip("dask_cuda not installed") return None - cluster = LocalCUDACluster(protocol="tcp") - yield cluster - cluster.close() + with LocalCUDACluster(protocol="tcp") as cluster: + yield cluster @pytest.fixture() def gpu_client(gpu_cluster): if gpu_cluster: - client = Client(gpu_cluster) - yield client - client.close() + with Client(gpu_cluster) as client: + yield client @pytest.fixture(scope="session", autouse=True) -def setup_dask_client(): - """Setup a dask client if requested""" - address = os.getenv("DASK_SQL_TEST_SCHEDULER", None) - if address: - client = Client(address) +def client(): + yield Client(address=os.getenv("DASK_SQL_TEST_SCHEDULER", None)) skip_if_external_scheduler = pytest.mark.skipif( os.getenv("DASK_SQL_TEST_SCHEDULER", None) is not None, reason="Can not run with external cluster", ) - - -@pytest.fixture() -def cluster(loop): # noqa: F811 - with LocalCluster(loop=loop) as cluster: - yield cluster - - -@pytest.fixture() -def client(cluster): - with Client(cluster) as client: - yield client diff --git a/tests/integration/test_cmd.py b/tests/integration/test_cmd.py index 5b07e4407..dd8b05083 100644 --- a/tests/integration/test_cmd.py +++ b/tests/integration/test_cmd.py @@ -100,9 +100,7 @@ def test_meta_commands(c, client, capsys): assert "Schema not_exists not available\n" == captured.out with pytest.raises( - OSError, - match="Timed out during handshake while " - "connecting to tcp://localhost:8787 after 5 s", + OSError, match="Timed out .* to tcp://localhost:8787 after 5 s", ): with dask_config.set({"distributed.comm.timeouts.connect": 5}): client = _meta_commands("\\dsc localhost:8787", context=c, client=client) @@ -120,8 +118,9 @@ def test_connection_info(c, client, capsys): def test_quit(c, client, capsys): + dummy_client = MagicMock() with patch("sys.exit", return_value=lambda: "exit"): - _meta_commands("quit", context=c, client=client) + _meta_commands("quit", context=c, client=dummy_client) captured = capsys.readouterr() assert captured.out == "Quitting dask-sql ...\n" diff --git a/tests/integration/test_create.py b/tests/integration/test_create.py index c768ef8bd..b6d513f4e 100644 --- a/tests/integration/test_create.py +++ b/tests/integration/test_create.py @@ -63,6 +63,8 @@ def test_cluster_memory(client, c, df, gpu): assert_frame_equal(df, return_df) + client.unpublish_dataset("df") + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_create_from_csv_persist(c, df, temporary_data_file, gpu): diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py index 355f1a2fb..f8426ae46 100644 --- a/tests/integration/test_jdbc.py +++ b/tests/integration/test_jdbc.py @@ -37,8 +37,6 @@ def app_client(c): yield TestClient(app) - app.client.close() - def test_jdbc_has_schema(app_client, c): create_meta_data(c) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 042800711..791ac0722 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -63,6 +63,8 @@ def gpu_training_df(c): return None +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_training_and_prediction(c, training_df): c.sql( """ @@ -149,6 +151,8 @@ def test_xgboost_training_prediction(c, gpu_training_df): check_trained_model(c) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_clustering_and_prediction(c, training_df): c.sql( """ @@ -165,6 +169,8 @@ def test_clustering_and_prediction(c, training_df): check_trained_model(c) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_iterative_and_prediction(c, training_df): c.sql( """ @@ -184,6 +190,8 @@ def test_iterative_and_prediction(c, training_df): check_trained_model(c) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_show_models(c, training_df): c.sql( """ @@ -403,6 +411,8 @@ def test_drop_model(c, training_df): assert "my_model" not in c.schema[c.schema_name].models +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_describe_model(c, training_df): c.sql( """ @@ -504,6 +514,8 @@ def test_export_model(c, training_df, tmpdir): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_mlflow_export(c, training_df, tmpdir): # Test only when mlflow was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -560,10 +572,12 @@ def test_mlflow_export(c, training_df, tmpdir): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? @pytest.mark.xfail( sys.platform == "win32", reason="Windows is not officially supported for dask/xgboost", ) +@skip_if_external_scheduler def test_mlflow_export_xgboost(c, client, training_df, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") @@ -626,6 +640,8 @@ def test_mlflow_export_lightgbm(c, training_df, tmpdir): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_ml_experiment(c, client, training_df): with pytest.raises( @@ -818,6 +834,8 @@ def test_ml_experiment(c, client, training_df): ) +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_experiment_automl_classifier(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # currently tested with tpot== @@ -841,6 +859,8 @@ def test_experiment_automl_classifier(c, client, training_df): check_trained_model(c, "my_automl_exp1") +# TODO - many ML tests fail on clusters without sklearn - can we avoid this? +@skip_if_external_scheduler def test_experiment_automl_regressor(c, client, training_df): tpot = pytest.importorskip("tpot", reason="tpot not installed") # test regressor