Update: add using pcm bytes (#4323) (#4409)

YooSungHyun · lhoestq · web-flow · commit 693418aa02b3 · 2022-07-07T15:16:08.000+02:00
* Update: add using pcm bytes

* re make style

* Update src/datasets/features/audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update src/datasets/features/audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update src/datasets/features/audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* delete: wrong comment

* Update: sampling_rate usage & test source update

* Update: pcm2wav bytes don't need path

we can open up soundfile lib

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update: we can convert PCM to WAV-style bytes, so they can be read by the soundfile lib

* Update: pcm doesn't use path, so check 'None'

* Update: don't use self's sampling_rate

self.sampling_rate is for decoding, so we have to use the value's sampling_rate

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update: add sampling_rate

we have to know sampling_rate in input values variable

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update: sampling_rate variable

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update tests/features/test_audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update tests/features/test_audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update tests/features/test_audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update: replace get sampling_rate

* Apply suggestions from code review

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
@@ -3,6 +3,7 @@
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union
 
+import numpy as np
 import pyarrow as pa
 from packaging import version
 
@@ -92,7 +93,22 @@ def encode_example(self, value: Union[str, dict]) -> dict:
             return {"bytes": buffer.getvalue(), "path": None}
         elif value.get("path") is not None and os.path.isfile(value["path"]):
             # we set "bytes": None to not duplicate the data if they're already available locally
-            return {"bytes": None, "path": value.get("path")}
+            if value["path"].endswith("pcm"):
+                # "PCM" only has raw audio bytes
+                if value.get("sampling_rate") is None:
+                    # At least, If you want to convert "PCM-byte" to "WAV-byte", you have to know sampling rate
+                    raise KeyError("To use PCM files, please specify a 'sampling_rate' in Audio object")
+                if value.get("bytes"):
+                    # If we already had PCM-byte, we don`t have to make "read file, make bytes" (just use it!)
+                    bytes_value = np.frombuffer(value["bytes"], dtype=np.int16).astype(np.float32) / 32767
+                else:
+                    bytes_value = np.memmap(value["path"], dtype="h", mode="r").astype(np.float32) / 32767
+
+                buffer = BytesIO(bytes())
+                sf.write(buffer, bytes_value, value["sampling_rate"], format="wav")
+                return {"bytes": buffer.getvalue(), "path": None}
+            else:
+                return {"bytes": None, "path": value.get("path")}
         elif value.get("bytes") is not None or value.get("path") is not None:
             # store the audio bytes, and path is used to infer the audio format using the file extension
             return {"bytes": value.get("bytes"), "path": value.get("path")}
diff --git a/tests/features/data/test_audio_16000.pcm b/tests/features/data/test_audio_16000.pcm
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
@@ -78,6 +78,26 @@ def test_audio_feature_encode_example(shared_datadir, build_example):
     assert decoded_example.keys() == {"path", "array", "sampling_rate"}
 
 
+@pytest.mark.parametrize(
+    "build_example",
+    [
+        lambda audio_path: {"path": audio_path, "sampling_rate": 16_000},
+        lambda audio_path: {"path": audio_path, "bytes": None, "sampling_rate": 16_000},
+        lambda audio_path: {"path": audio_path, "bytes": open(audio_path, "rb").read(), "sampling_rate": 16_000},
+        lambda audio_path: {"array": [0.1, 0.2, 0.3], "sampling_rate": 16_000},
+    ],
+)
+def test_audio_feature_encode_example_pcm(shared_datadir, build_example):
+    audio_path = str(shared_datadir / "test_audio_16000.pcm")
+    audio = Audio(sampling_rate=16_000)
+    encoded_example = audio.encode_example(build_example(audio_path))
+    assert isinstance(encoded_example, dict)
+    assert encoded_example.keys() == {"bytes", "path"}
+    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
+    decoded_example = audio.decode_example(encoded_example)
+    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
+
+
 @require_sndfile
 def test_audio_decode_example(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")
@@ -126,6 +146,18 @@ def test_audio_decode_example_opus(shared_datadir):
     assert decoded_example["sampling_rate"] == 48000
 
 
+@pytest.mark.parametrize("sampling_rate", [16_000, 48_000])
+def test_audio_decode_example_pcm(shared_datadir, sampling_rate):
+    audio_path = str(shared_datadir / "test_audio_16000.pcm")
+    audio_input = {"path": audio_path, "sampling_rate": 16_000}
+    audio = Audio(sampling_rate=sampling_rate)
+    decoded_example = audio.decode_example(audio.encode_example(audio_input))
+    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
+    assert decoded_example["path"] is None
+    assert decoded_example["array"].shape == (16208 * sampling_rate // 16_000,)
+    assert decoded_example["sampling_rate"] == sampling_rate
+
+
 @require_sox
 @require_torchaudio
 def test_audio_resampling_mp3_different_sampling_rates(shared_datadir):