|
1 | | -def test_sample(c, df): |
2 | | - # Fixed sample, check absolute numbers |
3 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (20) REPEATABLE (10)") |
4 | | - |
5 | | - assert len(return_df) == 234 |
6 | | - |
7 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (20) REPEATABLE (11)") |
8 | | - |
9 | | - assert len(return_df) == 468 # Yes, that is horrible, but at least fast... |
10 | | - |
11 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (50) REPEATABLE (10)") |
12 | | - |
13 | | - assert len(return_df) == 234 |
14 | | - |
15 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (0.001) REPEATABLE (10)") |
16 | | - |
17 | | - assert len(return_df) == 0 |
| 1 | +import numpy as np |
18 | 2 |
|
19 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (99.999) REPEATABLE (10)") |
| 3 | +from tests.utils import assert_eq |
20 | 4 |
|
21 | | - assert len(return_df) == len(df) |
22 | 5 |
|
23 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (50) REPEATABLE (10)") |
| 6 | +def get_system_sample(df, fraction, seed): |
| 7 | + random_state = np.random.RandomState(seed) |
| 8 | + random_choice = random_state.choice( |
| 9 | + [True, False], |
| 10 | + size=df.npartitions, |
| 11 | + replace=True, |
| 12 | + p=[fraction, 1 - fraction], |
| 13 | + ) |
24 | 14 |
|
25 | | - assert len(return_df) == 350 |
| 15 | + if random_choice.any(): |
| 16 | + df = df.partitions[random_choice] |
| 17 | + else: |
| 18 | + df = df.head(0, compute=False) |
26 | 19 |
|
27 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (70) REPEATABLE (10)") |
| 20 | + return df |
28 | 21 |
|
29 | | - assert len(return_df) == 490 |
30 | 22 |
|
31 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (0.001) REPEATABLE (10)") |
32 | | - |
33 | | - assert len(return_df) == 0 |
34 | | - |
35 | | - return_df = c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (99.999) REPEATABLE (10)") |
36 | | - |
37 | | - assert len(return_df) == len(df) |
38 | | - |
39 | | - # Not fixed sample, can only check boundaries |
| 23 | +def test_sample(c, df): |
| 24 | + ddf = c.sql("SELECT * FROM df") |
| 25 | + |
| 26 | + # fixed system samples |
| 27 | + assert_eq( |
| 28 | + c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (20) REPEATABLE (10)"), |
| 29 | + get_system_sample(ddf, 0.20, 10), |
| 30 | + ) |
| 31 | + assert_eq( |
| 32 | + c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (20) REPEATABLE (11)"), |
| 33 | + get_system_sample(ddf, 0.20, 11), |
| 34 | + ) |
| 35 | + assert_eq( |
| 36 | + c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (50) REPEATABLE (10)"), |
| 37 | + get_system_sample(ddf, 0.50, 10), |
| 38 | + ) |
| 39 | + assert_eq( |
| 40 | + c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (0.001) REPEATABLE (10)"), |
| 41 | + get_system_sample(ddf, 0.00001, 10), |
| 42 | + ) |
| 43 | + assert_eq( |
| 44 | + c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (99.999) REPEATABLE (10)"), |
| 45 | + get_system_sample(ddf, 0.99999, 10), |
| 46 | + ) |
| 47 | + |
| 48 | + # fixed bernoulli samples |
| 49 | + assert_eq( |
| 50 | + c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (50) REPEATABLE (10)"), |
| 51 | + ddf.sample(frac=0.50, replace=False, random_state=10), |
| 52 | + ) |
| 53 | + assert_eq( |
| 54 | + c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (70) REPEATABLE (10)"), |
| 55 | + ddf.sample(frac=0.70, replace=False, random_state=10), |
| 56 | + ) |
| 57 | + assert_eq( |
| 58 | + c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (0.001) REPEATABLE (10)"), |
| 59 | + ddf.sample(frac=0.00001, replace=False, random_state=10), |
| 60 | + ) |
| 61 | + assert_eq( |
| 62 | + c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (99.999) REPEATABLE (10)"), |
| 63 | + ddf.sample(frac=0.99999, replace=False, random_state=10), |
| 64 | + ) |
| 65 | + |
| 66 | + # variable samples, can only check boundaries |
40 | 67 | return_df = c.sql("SELECT * FROM df TABLESAMPLE BERNOULLI (50)") |
41 | | - |
42 | 68 | assert len(return_df) >= 0 and len(return_df) <= len(df) |
43 | 69 |
|
44 | 70 | return_df = c.sql("SELECT * FROM df TABLESAMPLE SYSTEM (50)") |
45 | | - |
46 | 71 | assert len(return_df) >= 0 and len(return_df) <= len(df) |
0 commit comments