From 69d163a4c725adc2758b5fe61f9f6e23e16d9c7e Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Tue, 18 Jun 2024 18:56:08 -0400
Subject: [PATCH 1/3] WebUI + Audio Fix

1. audio fix: explicitly specify the audio codec in `util.py`, otherwise the video is technically corrupt and doesn't play sound
2. web ui: gradio web ui
3. print the current step while running inference

gradio
---
 hallo/utils/util.py  |  2 +-
 requirements.txt     |  3 ++-
 scripts/app.py       | 49 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/inference.py |  2 ++
 4 files changed, 54 insertions(+), 2 deletions(-)
 create mode 100644 scripts/app.py

diff --git a/hallo/utils/util.py b/hallo/utils/util.py
index 3a460f7c..f4b6563a 100644
--- a/hallo/utils/util.py
+++ b/hallo/utils/util.py
@@ -315,7 +315,7 @@ def make_frame(t):
     new_video_clip = VideoClip(make_frame, duration=tensor.shape[0] / fps)
     audio_clip = AudioFileClip(audio_source).subclip(0, tensor.shape[0] / fps)
     new_video_clip = new_video_clip.set_audio(audio_clip)
-    new_video_clip.write_videofile(output_video_file, fps=fps)
+    new_video_clip.write_videofile(output_video_file, fps=fps, audio_codec='aac')
 
 
 silhouette_ids = [
diff --git a/requirements.txt b/requirements.txt
index 40eff183..7c3c5dcd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,4 +27,5 @@ transformers==4.39.2
 xformers==0.0.25.post1
 isort==5.13.2
 pylint==3.2.2
-pre-commit==3.7.1
\ No newline at end of file
+pre-commit==3.7.1
+gradio==4.36.1
diff --git a/scripts/app.py b/scripts/app.py
new file mode 100644
index 00000000..522e04da
--- /dev/null
+++ b/scripts/app.py
@@ -0,0 +1,49 @@
+from inference import inference_process
+import argparse
+import gradio as gr
+from omegaconf import OmegaConf
+def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio):
+  dict = {
+    'data': {
+      'source_image': {
+        'width': size,
+        'height': size
+      },
+      'export_video': {
+        'fps': fps
+      }
+    },
+    'cfg_scale': cfg,
+    'source_image': image,
+    'driving_audio': audio,
+    'pose_weight': pose_weight,
+    'face_weight': face_weight,
+    'lip_weight': lip_weight,
+    'face_expand_ratio': face_expand_ratio,
+    'config': 'configs/inference/default.yaml',
+    'checkpoint': None,
+    'output': ".cache/output.mp4",
+    'inference_steps': steps
+  }
+  args = argparse.Namespace()
+  for key, value in dict.items():
+      setattr(args, key, value)
+  return inference_process(args)
+
+app = gr.Interface(
+    fn=predict,
+    inputs=[
+      gr.Image(label="source image (no webp)", type="filepath", format="jpeg"),
+      gr.Audio(label="source audio", type="filepath"),
+      gr.Number(label="size", value=512, minimum=256, maximum=512, step=64, precision=0),
+      gr.Number(label="steps", value=40, minimum=1, step=1, precision=0),
+      gr.Number(label="fps", value=25, minimum=1, step=1, precision=0),
+      gr.Slider(label="CFG Scale", value=3.5, minimum=0, maximum=10, step=0.01),
+      gr.Number(label="pose weight", value=1.0),
+      gr.Number(label="face weight", value=1.0),
+      gr.Number(label="lip weight", value=1.0),
+      gr.Number(label="face expand ratio", value=1.2),
+    ],
+    outputs=[gr.Video()],
+)
+app.launch()
diff --git a/scripts/inference.py b/scripts/inference.py
index 8bbc5cc8..c2ef0bbb 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -288,6 +288,7 @@ def inference_process(args: argparse.Namespace):
     generator = torch.manual_seed(42)
 
     for t in range(times):
+        print(f"[{t+1}/{times}]")
 
         if len(tensor_result) == 0:
             # The first iteration
@@ -342,6 +343,7 @@ def inference_process(args: argparse.Namespace):
     output_file = config.output
     # save the result after all iteration
     tensor_to_video(tensor_result, output_file, driving_audio_path)
+    return output_file
 
 
 if __name__ == "__main__":

From d2bae3df1bb1dfebaa63a75aaf91e0824478028b Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Wed, 19 Jun 2024 13:40:12 -0400
Subject: [PATCH 2/3] lint

---
 scripts/app.py | 69 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/scripts/app.py b/scripts/app.py
index 522e04da..5581f25f 100644
--- a/scripts/app.py
+++ b/scripts/app.py
@@ -1,34 +1,47 @@
-from inference import inference_process
+"""
+This script is a gradio web ui.
+
+The script takes an image and an audio clip, and lets you configure all the
+variables such as cfg_scale, pose_weight, face_weight, lip_weight, etc.
+
+Usage:
+This script can be run from the command line with the following command:
+
+python scripts/app.py
+"""
 import argparse
+from inference import inference_process
 import gradio as gr
-from omegaconf import OmegaConf
 def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio):
-  dict = {
-    'data': {
-      'source_image': {
-        'width': size,
-        'height': size
-      },
-      'export_video': {
-        'fps': fps
-      }
-    },
-    'cfg_scale': cfg,
-    'source_image': image,
-    'driving_audio': audio,
-    'pose_weight': pose_weight,
-    'face_weight': face_weight,
-    'lip_weight': lip_weight,
-    'face_expand_ratio': face_expand_ratio,
-    'config': 'configs/inference/default.yaml',
-    'checkpoint': None,
-    'output': ".cache/output.mp4",
-    'inference_steps': steps
-  }
-  args = argparse.Namespace()
-  for key, value in dict.items():
-      setattr(args, key, value)
-  return inference_process(args)
+    """
+    Create a gradio interface with the configs.
+    """
+    config = {
+        'data': {
+            'source_image': {
+                'width': size,
+                'height': size
+            },
+            'export_video': {
+                'fps': fps
+            }
+        },
+        'cfg_scale': cfg,
+        'source_image': image,
+        'driving_audio': audio,
+        'pose_weight': pose_weight,
+        'face_weight': face_weight,
+        'lip_weight': lip_weight,
+        'face_expand_ratio': face_expand_ratio,
+        'config': 'configs/inference/default.yaml',
+        'checkpoint': None,
+        'output': ".cache/output.mp4",
+        'inference_steps': steps
+    }
+    args = argparse.Namespace()
+    for key, value in config.items():
+        setattr(args, key, value)
+    return inference_process(args)
 
 app = gr.Interface(
     fn=predict,

From bb90dd7477b26466d3b243ef3049b2930df83bcb Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Wed, 19 Jun 2024 20:14:08 -0400
Subject: [PATCH 3/3] update

---
 scripts/app.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/app.py b/scripts/app.py
index 5581f25f..e106c02b 100644
--- a/scripts/app.py
+++ b/scripts/app.py
@@ -10,8 +10,11 @@
 python scripts/app.py
 """
 import argparse
-from inference import inference_process
+
 import gradio as gr
+from inference import inference_process
+
+
 def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio):
     """
     Create a gradio interface with the configs.