
Commit a61603d

bghira authored
New dataloader arguments for deleting problematic images (#30)
* DataLoader: make it optional to delete unwanted images, off by default
* Arguments: add terminal SNR parameters for tweaking, rather than being baked-in
* Make terminal SNR opt-in

---------

Co-authored-by: bghira <[email protected]>
1 parent c1a61bb commit a61603d

File tree

4 files changed (+103, -33 lines)

helpers/arguments.py
helpers/aspect_bucket.py
sdxl-env.sh.example
train_sdxl.py


helpers/arguments.py

Lines changed: 57 additions & 10 deletions
@@ -2,7 +2,9 @@
 
 
 def parse_args(input_args=None):
-    parser = argparse.ArgumentParser(description="The following SimpleTuner command-line options are available:")
+    parser = argparse.ArgumentParser(
+        description="The following SimpleTuner command-line options are available:"
+    )
     parser.add_argument(
         "--snr_gamma",
         type=float,
@@ -34,6 +36,34 @@ def parse_args(input_args=None):
             " SD 1.5 is epsilon."
         ),
     )
+    parser.add_argument(
+        '--training_scheduler_timestep_spacing',
+        type=str,
+        default="leading",
+        choices=["leading", "linspace", "trailing"],
+        help=(
+            "Spacing timesteps can fundamentally alter the course of history. Er, I mean, your model weights."
+            " For all training, including terminal SNR, it would seem that 'leading' is the right choice."
+            " However, for inference in terminal SNR models, 'trailing' is the correct choice."
+        )
+    )
+    parser.add_argument(
+        '--inference_scheduler_timestep_spacing',
+        type=str,
+        default="trailing",
+        choices=["leading", "linspace", "trailing"],
+        help=(
+            "The Bytedance paper on zero terminal SNR recommends inference using 'trailing'."
+        )
+    )
+    parser.add_argument(
+        '--rescale_betas_zero_snr',
+        action="store_true",
+        help=(
+            "If set, will rescale the betas to zero terminal SNR. This is recommended for training with v_prediction."
+            " For epsilon, this might help with fine details, but will not result in contrast improvements."
+        )
+    )
     parser.add_argument(
         "--vae_dtype",
         type=str,
@@ -113,13 +143,13 @@ def parse_args(input_args=None):
         "--seen_state_path",
         type=str,
         default=None,
-        help="Where the JSON document containing the state of the seen images is stored. This helps ensure we do not repeat images too many times."
+        help="Where the JSON document containing the state of the seen images is stored. This helps ensure we do not repeat images too many times.",
     )
     parser.add_argument(
         "--state_path",
         type=str,
         default=None,
-        help="A JSON document containing the current state of training, will be placed here."
+        help="A JSON document containing the current state of training, will be placed here.",
     )
     parser.add_argument(
         "--caption_strategy",
@@ -156,6 +186,15 @@ def parse_args(input_args=None):
             " resolution"
         ),
     )
+    parser.add_argument(
+        "--minimum_image_size",
+        type=int,
+        default=768,
+        help=(
+            "The minimum resolution for both sides of input images."
+            " If --delete_unwanted_images is set, images smaller than this will be DELETED."
+        ),
+    )
     parser.add_argument(
         "--crops_coords_top_left_h",
         type=int,
@@ -235,9 +274,7 @@ def parse_args(input_args=None):
         "--checkpoints_total_limit",
         type=int,
         default=None,
-        help=(
-            "Max number of checkpoints to store."
-        ),
+        help=("Max number of checkpoints to store."),
     )
     parser.add_argument(
         "--resume_from_checkpoint",
@@ -299,7 +336,9 @@ def parse_args(input_args=None):
         help="Power factor of the polynomial scheduler.",
     )
     parser.add_argument(
-        "--use_ema", action="store_true", help="Whether to use EMA (exponential moving average) model."
+        "--use_ema",
+        action="store_true",
+        help="Whether to use EMA (exponential moving average) model.",
     )
     parser.add_argument(
         "--non_ema_revision",
@@ -485,13 +524,13 @@ def parse_args(input_args=None):
         help="Run validation every X epochs.",
     )
     parser.add_argument(
-        '--validation_guidance',
+        "--validation_guidance",
         type=float,
         default=7.5,
         help="CFG value for validation images. Default: 7.5",
     )
     parser.add_argument(
-        '--validation_guidance_rescale',
+        "--validation_guidance_rescale",
        type=float,
         default=0.0,
         help="CFG rescale value for validation images. Default: 0.0, max 1.0",
@@ -593,7 +632,15 @@ def parse_args(input_args=None):
         help=(
             "When this option is provided, image cropping and processing will be disabled."
            " It is a good idea to use this with caution, for training multiple aspect ratios."
-        )
+        ),
+    )
+    parser.add_argument(
+        "--delete_unwanted_images",
+        action="store_true",
+        help=(
+            "If set, will delete images that are not of a minimum size to save on disk space for large training runs."
+            " Default behaviour: Unset, remove images from bucket only."
+        ),
     )
     parser.add_argument(
         "--offset_noise",

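For a quick sanity check of the new options, here is a minimal, self-contained sketch that mirrors the five new arguments with a throwaway argparse parser. It deliberately does not import helpers/arguments.py, so it makes no assumptions about which of the project's other arguments are required:

import argparse

# Hypothetical mirror of the new options shown in the diff above; this is not
# the project's parse_args() itself.
parser = argparse.ArgumentParser()
parser.add_argument("--training_scheduler_timestep_spacing", type=str,
                    default="leading", choices=["leading", "linspace", "trailing"])
parser.add_argument("--inference_scheduler_timestep_spacing", type=str,
                    default="trailing", choices=["leading", "linspace", "trailing"])
parser.add_argument("--rescale_betas_zero_snr", action="store_true")
parser.add_argument("--minimum_image_size", type=int, default=768)
parser.add_argument("--delete_unwanted_images", action="store_true")

args = parser.parse_args(["--rescale_betas_zero_snr", "--minimum_image_size=1024"])
print(args.rescale_betas_zero_snr)               # True: opted in to zero terminal SNR
print(args.training_scheduler_timestep_spacing)  # "leading" (the training default)
print(args.delete_unwanted_images)               # False: small images only leave the bucket
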
helpers/aspect_bucket.py

Lines changed: 28 additions & 20 deletions
@@ -20,33 +20,38 @@
 class BalancedBucketSampler(torch.utils.data.Sampler):
     def __init__(
         self,
-        aspect_ratio_bucket_indices,
+        aspect_ratio_bucket_indices: dict,
         batch_size: int = 15,
         seen_images_path: str = "/notebooks/SimpleTuner/seen_images.json",
         state_path: str = "/notebooks/SimpleTuner/bucket_sampler_state.json",
-        reset_threshold: int = 5000,  # Add a reset_threshold
+        reset_threshold: int = 5000,
         debug_aspect_buckets: bool = False,
+        delete_unwanted_images: bool = False,
+        minimum_image_size: int = None,
     ):
         """
-        Initialize the BalancedBucketSampler instance.
+        Initializes the sampler with provided settings.
 
-        Args:
-            aspect_ratio_bucket_indices (dict): A dictionary mapping aspect ratios to image paths.
-            batch_size (int): The number of images per sample during training.
-            seen_images_path (str): The path to save/load the seen images.
-            state_path (str): The path to save/load the state of the sampler.
-            reset_threshold (int): The number of seen images to trigger a reset.
-            debug_aspect_buckets (bool): If True, enable debug logging.
+        Parameters:
+        - aspect_ratio_bucket_indices: Dictionary containing aspect ratios as keys and list of image paths as values.
+        - batch_size: Number of samples to draw per batch.
+        - seen_images_path: Path to store the seen images.
+        - state_path: Path to store the current state of the sampler.
+        - reset_threshold: The threshold after which the seen images list should be reset.
+        - debug_aspect_buckets: Flag to log state for debugging purposes.
+        - delete_unwanted_images: Flag to decide whether to delete unwanted (small) images or just remove from the bucket.
         """
         self.aspect_ratio_bucket_indices = aspect_ratio_bucket_indices
-        self.buckets = self.load_buckets()
+        self.buckets = list(self.aspect_ratio_bucket_indices.keys())
         self.exhausted_buckets = []
         self.batch_size = batch_size
-        self.current_bucket = 0
         self.seen_images_path = seen_images_path
         self.state_path = state_path
         self.reset_threshold = reset_threshold
         self.debug_aspect_buckets = debug_aspect_buckets
+        self.delete_unwanted_images = delete_unwanted_images
+        self.current_bucket = 0
+        self.minimum_image_size = minimum_image_size
         self.seen_images = self.load_seen_images()
 
     def save_state(self):
@@ -85,13 +90,16 @@ def remove_image(self, image_path, bucket):
         self.aspect_ratio_bucket_indices[bucket].remove(image_path)
 
     def handle_small_image(self, image_path, bucket):
-        logger.warning(f"Image too small: DELETING image and continuing search.")
-        # try:
-        #     os.remove(image_path)
-        # except Exception as e:
-        #     logger.warning(
-        #         f"The image was already deleted. Another GPU must have gotten to it."
-        #     )
+        if self.delete_unwanted_images:
+            try:
+                logger.warning(f"Image too small: DELETING image and continuing search.")
+                os.remove(image_path)
+            except Exception as e:
+                logger.warning(
+                    f"The image was already deleted. Another GPU must have gotten to it."
+                )
+        else:
+            logger.warning(f"Image too small, but --delete_unwanted_images is not provided, so we simply ignore and remove from bucket.")
         self.remove_image(image_path, bucket)
 
     def handle_incorrect_bucket(self, image_path, bucket, actual_bucket):
@@ -194,7 +202,7 @@ def __iter__(self):
             except:
                 logger.warning(f"Image was bad or in-progress: {image_path}")
                 continue
-            if image.width < 880 or image.height < 880:
+            if image.width < self.minimum_image_size or image.height < self.minimum_image_size:
                 image.close()
                 self.handle_small_image(image_path, bucket)
                 continue
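
A rough usage sketch for the updated sampler follows; it is not part of the commit. The bucket dictionary and JSON paths are placeholder values, the import path assumes the repository layout shown above, and it assumes the state files do not need to exist beforehand:

from helpers.aspect_bucket import BalancedBucketSampler

# Placeholder bucket index mapping aspect ratio -> list of image paths.
aspect_ratio_bucket_indices = {"1.0": ["/data/example-0001.png", "/data/example-0002.png"]}

sampler = BalancedBucketSampler(
    aspect_ratio_bucket_indices=aspect_ratio_bucket_indices,
    batch_size=15,
    seen_images_path="/tmp/seen_images.json",      # placeholder, not the notebook default
    state_path="/tmp/bucket_sampler_state.json",   # placeholder, not the notebook default
    delete_unwanted_images=False,  # new: undersized files stay on disk and only leave the bucket
    minimum_image_size=768,        # new: matches the --minimum_image_size default
)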

sdxl-env.sh.example

Lines changed: 6 additions & 1 deletion
@@ -87,4 +87,9 @@ export TRAINER_EXTRA_ARGS="--allow_tf32 --use_8bit_adam --use_ema" # anything y
 
 # These are pretty sketchy to change. --use_original_images can be removed to enable image cropping. Not tested for SDXL.
 export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --enable_xformers_memory_efficient_attention --use_original_images=true"
-export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --gradient_checkpointing --gradient_accumulation_steps=${GRADIENT_ACCUMULATION_STEPS}"
+export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --gradient_checkpointing --gradient_accumulation_steps=${GRADIENT_ACCUMULATION_STEPS}"
+
+## For terminal SNR training:
+
+#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --prediction_type=v_prediction --rescale_betas_zero_snr"
+#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --training_scheduler_timestep_spacing=leading --inference_scheduler_timestep_spacing=trailing"

train_sdxl.py

Lines changed: 12 additions & 2 deletions
@@ -417,12 +417,14 @@ def tokenize_captions(captions, tokenizer):
         args.pretrained_model_name_or_path,
         subfolder="scheduler",
         prediction_type=args.prediction_type,
-        rescale_betas_zero_snr=True,
+        timestep_spacing=args.training_scheduler_timestep_spacing,
+        rescale_betas_zero_snr=args.rescale_betas_zero_snr,
     )
     noise_scheduler = DDPMScheduler.from_pretrained(
         args.pretrained_model_name_or_path,
         subfolder="scheduler",
         prediction_type=args.prediction_type,
+        timestep_spacing=args.training_scheduler_timestep_spacing,
         trained_betas=betas_scheduler.betas.numpy().tolist(),
     )
     text_encoder_1 = text_encoder_cls_1.from_pretrained(
@@ -554,6 +556,8 @@ def collate_fn(examples):
         seen_images_path=args.seen_state_path,
         state_path=args.state_path,
         debug_aspect_buckets=args.debug_aspect_buckets,
+        delete_unwanted_images=args.delete_unwanted_images,
+        minimum_image_size=args.minimum_image_size
     )
     logger.info("Plugging sampler into dataloader")
     train_dataloader = torch.utils.data.DataLoader(
@@ -938,7 +942,13 @@ def collate_fn(examples):
                 revision=args.revision,
                 torch_dtype=weight_dtype,
             )
-            pipeline.scheduler.config.prediction_type = args.prediction_type or noise_scheduler.config.prediction_type
+            pipeline.scheduler = DDIMScheduler.from_pretrained(
+                args.pretrained_model_name_or_path,
+                subfolder="scheduler",
+                prediction_type=args.prediction_type,
+                timestep_spacing=args.inference_scheduler_timestep_spacing,
+                rescale_betas_zero_snr=args.rescale_betas_zero_snr,
+            )
             pipeline = pipeline.to(accelerator.device)
             pipeline.set_progress_bar_config(disable=True)
 
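
To illustrate the scheduler wiring above outside the trainer, here is a hedged sketch that goes straight through diffusers. The model ID is only an example, prediction_type is set to v_prediction because terminal SNR is the opt-in case this commit targets, and the timestep_spacing and rescale_betas_zero_snr keyword arguments assume a diffusers release recent enough to expose them in the DDIM/DDPM scheduler configs:

from diffusers import DDIMScheduler, DDPMScheduler

model_id = "stabilityai/stable-diffusion-xl-base-1.0"  # example checkpoint, not hard-coded by the trainer

# DDIM scheduler with zero terminal SNR rescaling; the diff builds one like this
# for the validation pipeline (--inference_scheduler_timestep_spacing defaults to "trailing").
betas_scheduler = DDIMScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    prediction_type="v_prediction",
    timestep_spacing="trailing",
    rescale_betas_zero_snr=True,
)

# DDPM noise scheduler for training, mirroring --training_scheduler_timestep_spacing
# ("leading" by default) and reusing the rescaled betas via trained_betas, as the diff does.
noise_scheduler = DDPMScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    prediction_type="v_prediction",
    timestep_spacing="leading",
    trained_betas=betas_scheduler.betas.numpy().tolist(),
)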

0 commit comments
