27 changes: 22 additions & 5 deletions .github/workflows/self-scheduled.yml
@@ -1,3 +1,8 @@
# configuration notes:
#
# - `source .env/bin/activate` currently needs to be run first in each step. Otherwise
#   the step uses the system-wide Python interpreter.

name: Self-hosted runner (scheduled)

on:
@@ -227,7 +232,7 @@ jobs:
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

- name: Run all tests on GPU
- name: Run all tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
@@ -238,8 +243,20 @@ jobs:
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multiple_gpu_failures_short.txt

- name: Run all pipeline tests on GPU

- name: Run examples tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu examples

- name: Failure short reports
if: ${{ always() }}
run: cat reports/examples_torch_multiple_gpu_failures_short.txt

- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
@@ -306,7 +323,7 @@ jobs:
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

- name: Run all tests on GPU
- name: Run all tests on multi-GPU
Contributor:

Suggested change:
-      - name: Run all tests on multi-GPU
+      - name: Run tests/ on multi-GPU

Contributor Author:

But it is RUN_SLOW=1 - everywhere else in the config it says "all tests" for this env var.
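(For context: `RUN_SLOW` is the env var the `slow` decorator in `transformers.testing_utils` keys off, which is why setting it means "all tests" run. A minimal sketch of that gating - the assumed shape, not necessarily the exact helper:)

```python
import os
import unittest


def parse_flag_from_env(key, default=False):
    # treat "yes"/"true"/"1" (any case) as enabled
    return os.environ.get(key, str(default)).lower() in ("yes", "true", "1")


_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)


def slow(test_case):
    # skip the decorated test unless RUN_SLOW is set in the environment
    return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
```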

env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
@@ -318,7 +335,7 @@ jobs:
if: ${{ always() }}
run: cat reports/tests_tf_multiple_gpu_failures_short.txt

- name: Run all pipeline tests on GPU
- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
3 changes: 2 additions & 1 deletion examples/bert-loses-patience/test_run_glue_with_pabee.py
@@ -4,7 +4,7 @@
from unittest.mock import patch

import run_glue_with_pabee
from transformers.testing_utils import TestCasePlus
from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me


logging.basicConfig(level=logging.DEBUG)
@@ -20,6 +20,7 @@ def get_setup_file():


class PabeeTests(TestCasePlus):
@require_torch_non_multigpu_but_fix_me
Contributor:

Does this break under multi-GPU?

Should the decorator be called require_torch_single_gpu?

Contributor Author:

We don't know - that's why it has "but_fix_me". I just added it to all tests in examples; now we need to go over them, review each one, and either make it work under multi-GPU or designate it as non-multi-GPU.

As for naming, let's discuss it in normal comments, since these get hidden once resolved.
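(For reference: the decorator's name suggests it simply skips a test when more than one GPU is visible. A plausible sketch of that gating - an assumption, the actual `testing_utils` implementation may differ:)

```python
import unittest

import torch


def require_torch_non_multigpu_but_fix_me(test_case):
    # Sketch only: skip under multi-GPU. The "but_fix_me" suffix marks tests
    # that were blanket-annotated and still need individual review.
    if torch.cuda.device_count() > 1:
        return unittest.skip("test requires 0 or 1 GPU")(test_case)
    return test_case
```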

def test_run_glue(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
3 changes: 2 additions & 1 deletion examples/deebert/test_glue_deebert.py
@@ -5,7 +5,7 @@
from unittest.mock import patch

import run_glue_deebert
from transformers.testing_utils import slow
from transformers.testing_utils import require_torch_non_multigpu_but_fix_me, slow


logging.basicConfig(level=logging.DEBUG)
@@ -26,6 +26,7 @@ def setup(self) -> None:
logger.addHandler(stream_handler)

@slow
@require_torch_non_multigpu_but_fix_me
def test_glue_deebert_train(self):

train_args = """
4 changes: 4 additions & 0 deletions examples/rag/test_distributed_retriever.py
@@ -16,6 +16,7 @@
from transformers.configuration_rag import RagConfig
from transformers.file_utils import is_datasets_available, is_faiss_available, is_psutil_available, is_torch_available
from transformers.retrieval_rag import CustomHFIndex
from transformers.testing_utils import require_torch_non_multigpu_but_fix_me
from transformers.tokenization_bart import BartTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
@@ -178,6 +179,7 @@ def get_dummy_custom_hf_index_retriever(self, init_retrieval: bool, from_disk: b
retriever.init_retrieval(port)
return retriever

@require_torch_non_multigpu_but_fix_me
def test_pytorch_distributed_retriever_retrieve(self):
n_docs = 1
retriever = self.get_dummy_pytorch_distributed_retriever(init_retrieval=True)
@@ -193,6 +195,7 @@ def test_pytorch_distributed_retriever_retrieve(self):
self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc
self.assertListEqual(doc_ids.tolist(), [[1], [0]])

@require_torch_non_multigpu_but_fix_me
def test_custom_hf_index_retriever_retrieve(self):
n_docs = 1
retriever = self.get_dummy_custom_hf_index_retriever(init_retrieval=True, from_disk=False)
@@ -208,6 +211,7 @@ def test_custom_hf_index_retriever_retrieve(self):
self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc
self.assertListEqual(doc_ids.tolist(), [[1], [0]])

@require_torch_non_multigpu_but_fix_me
def test_custom_pytorch_distributed_retriever_retrieve_from_disk(self):
n_docs = 1
retriever = self.get_dummy_custom_hf_index_retriever(init_retrieval=True, from_disk=True)
5 changes: 4 additions & 1 deletion examples/seq2seq/test_bash_script.py
@@ -13,7 +13,7 @@
from finetune import SummarizationModule, main
from transformers import MarianMTModel
from transformers.file_utils import cached_path
from transformers.testing_utils import TestCasePlus, require_torch_gpu, slow
from transformers.testing_utils import TestCasePlus, require_torch_gpu, require_torch_non_multigpu_but_fix_me, slow
from utils import load_json


@@ -32,13 +32,15 @@ def setUp(self):

@slow
@require_torch_gpu
@require_torch_non_multigpu_but_fix_me
def test_model_download(self):
"""This warms up the cache so that we can time the next test without including download time, which varies between machines."""
MarianMTModel.from_pretrained(MARIAN_MODEL)

# @timeout_decorator.timeout(1200)
@slow
@require_torch_gpu
@require_torch_non_multigpu_but_fix_me
def test_train_mbart_cc25_enro_script(self):
env_vars_to_replace = {
"$MAX_LEN": 64,
@@ -127,6 +129,7 @@ class TestDistilMarianNoTeacher(TestCasePlus):
@timeout_decorator.timeout(600)
@slow
@require_torch_gpu
@require_torch_non_multigpu_but_fix_me
def test_opus_mt_distill_script(self):
data_dir = f"{self.test_file_dir_str}/test_data/wmt_en_ro"
env_vars_to_replace = {
9 changes: 8 additions & 1 deletion examples/seq2seq/test_datasets.py
@@ -11,7 +11,7 @@
from test_seq2seq_examples import ARTICLES, BART_TINY, MARIAN_TINY, MBART_TINY, SUMMARIES, T5_TINY, make_test_data_dir
from transformers import AutoTokenizer
from transformers.modeling_bart import shift_tokens_right
from transformers.testing_utils import TestCasePlus, slow
from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me, slow
from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset


@@ -30,6 +30,7 @@ class TestAll(TestCasePlus):
],
)
@slow
@require_torch_non_multigpu_but_fix_me
def test_seq2seq_dataset_truncation(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name)
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
@@ -69,6 +70,7 @@ def test_seq2seq_dataset_truncation(self, tok_name):
break # No need to test every batch

@parameterized.expand([BART_TINY, BERT_BASE_CASED])
@require_torch_non_multigpu_but_fix_me
def test_legacy_dataset_truncation(self, tok):
tokenizer = AutoTokenizer.from_pretrained(tok)
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
@@ -93,6 +95,7 @@ def test_legacy_dataset_truncation(self, tok):
assert max_len_target > trunc_target # Truncated
break # No need to test every batch

@require_torch_non_multigpu_but_fix_me
def test_pack_dataset(self):
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")

@@ -111,6 +114,7 @@ def test_pack_dataset(self):
assert orig_paths == new_paths

@pytest.mark.skipif(not FAIRSEQ_AVAILABLE, reason="This test requires fairseq")
@require_torch_non_multigpu_but_fix_me
def test_dynamic_batch_size(self):
if not FAIRSEQ_AVAILABLE:
return
@@ -135,6 +139,7 @@ def test_dynamic_batch_size(self):
if failures:
raise AssertionError(f"too many tokens in {len(failures)} batches")

@require_torch_non_multigpu_but_fix_me
def test_sortish_sampler_reduces_padding(self):
ds, _, tokenizer = self._get_dataset(max_len=512)
bs = 2
@@ -174,6 +179,7 @@ def _get_dataset(self, n_obs=1000, max_len=128):
)
return ds, max_tokens, tokenizer

@require_torch_non_multigpu_but_fix_me
def test_distributed_sortish_sampler_splits_indices_between_procs(self):
ds, max_tokens, tokenizer = self._get_dataset()
ids1 = set(DistributedSortishSampler(ds, 256, num_replicas=2, rank=0, add_extra_examples=False))
@@ -189,6 +195,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self):
PEGASUS_XSUM,
],
)
@require_torch_non_multigpu_but_fix_me
def test_dataset_kwargs(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name)
if tok_name == MBART_TINY:
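(Aside: the distributed-sampler test above checks the key sharding invariant - different ranks draw disjoint index sets that together cover the dataset. The same check works against torch's stock `DistributedSampler`; a sketch, not the repo's `DistributedSortishSampler`:)

```python
import torch
from torch.utils.data import DistributedSampler, TensorDataset

ds = TensorDataset(torch.arange(100))
ids0 = set(DistributedSampler(ds, num_replicas=2, rank=0, shuffle=False))
ids1 = set(DistributedSampler(ds, num_replicas=2, rank=1, shuffle=False))
assert ids0.isdisjoint(ids1)  # no example is assigned to both ranks
assert len(ids0 | ids1) == len(ds)  # together the ranks cover the dataset
```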
9 changes: 8 additions & 1 deletion examples/seq2seq/test_fsmt_bleu_score.py
@@ -19,7 +19,13 @@

from parameterized import parameterized
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
from transformers.testing_utils import (
get_tests_dir,
require_torch,
require_torch_non_multigpu_but_fix_me,
slow,
torch_device,
)
from utils import calculate_bleu


@@ -48,6 +54,7 @@ def get_model(self, mname):
]
)
@slow
@require_torch_non_multigpu_but_fix_me
def test_bleu_scores(self, pair, min_bleu_score):
# note: this test is not testing the best performance since it only evals a small batch
# but it should be enough to detect a regression in the output quality
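(Aside: `test_bleu_scores` is a regression floor - eval a small batch and assert BLEU stays above a known-good threshold. A standalone version of the pattern, assuming `calculate_bleu` wraps sacrebleu's `corpus_bleu`, which may not match the repo's `utils` exactly:)

```python
from sacrebleu import corpus_bleu


def calculate_bleu(output_lns, refs_lns):
    # corpus-level BLEU; sacrebleu takes a list of reference streams
    return round(corpus_bleu(output_lns, [refs_lns]).score, 4)


def assert_no_bleu_regression(hypotheses, references, min_bleu_score):
    # a small batch underestimates peak quality, but a hard floor still
    # catches gross regressions in output quality
    score = calculate_bleu(hypotheses, references)
    assert score >= min_bleu_score, f"BLEU {score} < floor {min_bleu_score}"
```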
7 changes: 6 additions & 1 deletion examples/seq2seq/test_make_student.py
@@ -4,7 +4,7 @@
from make_student import create_student_by_copying_alternating_layers
from transformers import AutoConfig
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch
from transformers.testing_utils import require_torch, require_torch_non_multigpu_but_fix_me


TINY_BART = "sshleifer/bart-tiny-random"
@@ -17,23 +17,28 @@ class MakeStudentTester(unittest.TestCase):
def teacher_config(self):
return AutoConfig.from_pretrained(TINY_BART)

@require_torch_non_multigpu_but_fix_me
def test_valid_t5(self):
student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=1)
self.assertEqual(student.config.num_hidden_layers, 1)

@require_torch_non_multigpu_but_fix_me
def test_asymmetric_t5(self):
student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=None)

@require_torch_non_multigpu_but_fix_me
def test_same_decoder_small_encoder(self):
student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=None)
self.assertEqual(student.config.encoder_layers, 1)
self.assertEqual(student.config.decoder_layers, self.teacher_config.encoder_layers)

@require_torch_non_multigpu_but_fix_me
def test_small_enc_small_dec(self):
student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=1)
self.assertEqual(student.config.encoder_layers, 1)
self.assertEqual(student.config.decoder_layers, 1)

@require_torch_non_multigpu_but_fix_me
def test_raises_assert(self):
with self.assertRaises(AssertionError):
create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=None, d=None)
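(Aside: for intuition about `e`/`d` in `create_student_by_copying_alternating_layers` - the student is initialized from evenly spaced teacher layers, `None` keeps the teacher's depth on that side, and both `None` raises, as the test above shows. A toy sketch of the layer selection; a hypothetical helper, not the repo's implementation:)

```python
def pick_layers_to_copy(n_student, n_teacher):
    # evenly spaced teacher layer indices, always including layer 0
    assert n_student <= n_teacher, "student cannot be deeper than the teacher"
    if n_student == n_teacher:
        return list(range(n_teacher))
    step = n_teacher / n_student
    return [int(i * step) for i in range(n_student)]


assert pick_layers_to_copy(1, 12) == [0]  # e=1 copies the first layer
assert pick_layers_to_copy(3, 12) == [0, 4, 8]
```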
21 changes: 20 additions & 1 deletion examples/seq2seq/test_seq2seq_examples.py
@@ -19,7 +19,14 @@
from run_eval_search import run_search
from transformers import AutoConfig, AutoModelForSeq2SeqLM
from transformers.hf_api import HfApi
from transformers.testing_utils import CaptureStderr, CaptureStdout, TestCasePlus, require_torch_gpu, slow
from transformers.testing_utils import (
CaptureStderr,
CaptureStdout,
TestCasePlus,
require_torch_gpu,
require_torch_non_multigpu_but_fix_me,
slow,
)
from utils import ROUGE_KEYS, label_smoothed_nll_loss, lmap, load_json


@@ -126,6 +133,7 @@ def setUpClass(cls):

@slow
@require_torch_gpu
@require_torch_non_multigpu_but_fix_me
def test_hub_configs(self):
"""I put require_torch_gpu cause I only want this to run with self-scheduled."""

Expand All @@ -143,10 +151,12 @@ def test_hub_configs(self):
failures.append(m)
assert not failures, f"The following models could not be loaded through AutoConfig: {failures}"

@require_torch_non_multigpu_but_fix_me
def test_distill_no_teacher(self):
updates = dict(student_encoder_layers=2, student_decoder_layers=1, no_teacher=True)
self._test_distiller_cli(updates)

@require_torch_non_multigpu_but_fix_me
def test_distill_checkpointing_with_teacher(self):
updates = dict(
student_encoder_layers=2,
Expand All @@ -171,6 +181,7 @@ def test_distill_checkpointing_with_teacher(self):
convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))

@require_torch_non_multigpu_but_fix_me
def test_loss_fn(self):
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True)
input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
Expand All @@ -191,6 +202,7 @@ def test_loss_fn(self):
# TODO: understand why this breaks
self.assertEqual(nll_loss, model_computed_loss)

@require_torch_non_multigpu_but_fix_me
def test_distill_mbart(self):
updates = dict(
student_encoder_layers=2,
Expand All @@ -215,6 +227,7 @@ def test_distill_mbart(self):
assert len(all_files) > 2
self.assertEqual(len(transformer_ckpts), 2)

@require_torch_non_multigpu_but_fix_me
def test_distill_t5(self):
updates = dict(
student_encoder_layers=1,
@@ -296,18 +309,21 @@ def run_eval_tester(self, model):

# test one model to quickly (no-@slow) catch simple problems and do an
# extensive testing of functionality with multiple models as @slow separately
@require_torch_non_multigpu_but_fix_me
def test_run_eval(self):
self.run_eval_tester(T5_TINY)

# any extra models should go into the list here - can be slow
@parameterized.expand([BART_TINY, MBART_TINY])
@slow
@require_torch_non_multigpu_but_fix_me
def test_run_eval_slow(self, model):
self.run_eval_tester(model)

# testing with 2 models to validate: 1. translation (t5) 2. summarization (mbart)
@parameterized.expand([T5_TINY, MBART_TINY])
@slow
@require_torch_non_multigpu_but_fix_me
def test_run_eval_search(self, model):
input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
output_file_name = input_file_name.parent / "utest_output.txt"
@@ -358,6 +374,7 @@ def test_run_eval_search(self, model):
@parameterized.expand(
[T5_TINY, BART_TINY, MBART_TINY, MARIAN_TINY, FSMT_TINY],
)
@require_torch_non_multigpu_but_fix_me
def test_finetune(self, model):
args_d: dict = CHEAP_ARGS.copy()
task = "translation" if model in [MBART_TINY, MARIAN_TINY, FSMT_TINY] else "summarization"
@@ -409,6 +426,7 @@ def test_finetune(self, model):
assert isinstance(example_batch, dict)
assert len(example_batch) >= 4

@require_torch_non_multigpu_but_fix_me
def test_finetune_extra_model_args(self):
args_d: dict = CHEAP_ARGS.copy()

@@ -459,6 +477,7 @@ def test_finetune_extra_model_args(self):
model = main(args)
assert str(excinfo.value) == f"model config doesn't have a `{unsupported_param}` attribute"

@require_torch_non_multigpu_but_fix_me
def test_finetune_lr_schedulers(self):
args_d: dict = CHEAP_ARGS.copy()
