Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions runtime/android/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.gradle/
build/
local.properties
*.iml
.idea/
.cxx/
.externalNativeBuild/
captures/
41 changes: 41 additions & 0 deletions runtime/android/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# WeSpeaker Android Speaker Verification Demo

This app extracts speaker embeddings from two **16 kHz PCM WAV** clips on device, computes cosine similarity (same mapping to 0–1 as desktop `runtime/onnxruntime` `asv_main`), and compares against a threshold to decide same vs different speaker.

## ONNX model

Export ONNX on a PC following the repo docs:

```bash
python wespeaker/bin/export_onnx.py \
--config $exp/config.yaml \
--checkpoint $exp/avg_model.pt \
--output_model final.onnx
```

Copy `final.onnx` to `app/src/main/assets/final.onnx` and build (filename must match).

## Build

**JDK 17 or newer** is required (Android Gradle Plugin 8.x). If only Java 8 is installed, install JDK 17 or pick the bundled JDK under Android Studio *Settings → Build → Gradle → Gradle JDK*.

Open `runtime/android` in Android Studio, or use the Gradle wrapper:

```bash
cd runtime/android
./gradlew :app:assembleDebug
```

The app depends on [ONNX Runtime Android](https://github.com/microsoft/onnxruntime) (`onnxruntime-android` AAR). Native integration matches [wekws/runtime/android](https://github.com/wenet-e2e/wekws/tree/main/runtime/android): a resolvable `extractForNativeBuild` configuration unpacks `headers/` and `jni/` from the AAR; CMake uses `include_directories` and links `libonnxruntime.so` (no Prefab / `find_package`). App logic reuses this repo’s `runtime/core` Fbank, `SpeakerEngine`, and ONNX backend.

## Usage

1. After installing the APK, pick the enrollment and test WAV files (16 kHz recommended; other sample rates are not resampled in-app and may hurt quality).
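2. Set the threshold, Fbank dimension, and sample rate to match your training/export settings (the embedding size is read from the ONNX model, and each clip is embedded as a whole rather than in chunks).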
3. Tap **Compare** to see similarity score and same/different verdict.

## Notes

- If CMake reports that the extracted `onnxruntime*.aar` directory is missing, run **Sync** followed by a full **Build** so that `extractAARForNativeBuild` runs before `configureCMake` (same approach as wekws).
- First inference copies the model from assets to app-private storage; ensure `assets/final.onnx` exists and is non-empty.
- Default threshold is 0.5; tune on your validation set.
4 changes: 4 additions & 0 deletions runtime/android/app/proguard-rules.pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# WeSpeaker JNI: keep the WespeakerNative class name and the names of its
# native methods unchanged. The native library exports its entry points under
# symbols derived from them (Java_com_wespeaker_app_WespeakerNative_<method>),
# so renaming either during obfuscation would break the JNI lookup.
-keepclasseswithmembernames class com.wespeaker.app.WespeakerNative {
native <methods>;
}
23 changes: 23 additions & 0 deletions runtime/android/app/src/main/AndroidManifest.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Manifest for the WeSpeaker speaker-verification demo app: one launcher activity. -->
<manifest xmlns:android="http://schemas.android.com/apk/res/android">

<!-- NOTE(review): the README only describes picking existing WAV files; confirm
     the app really records from the microphone, otherwise this permission can
     be dropped. -->
<uses-permission android:name="android.permission.RECORD_AUDIO" />

<application
android:allowBackup="true"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round"
android:supportsRtl="true"
android:theme="@style/Theme.Wespeaker">
<!-- Entry activity. It is exported and carries the MAIN/LAUNCHER filter so the
     home screen can start it; adjustResize shrinks the window when the soft
     keyboard is shown. -->
<activity
android:name=".MainActivity"
android:exported="true"
android:windowSoftInputMode="adjustResize">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>
2 changes: 2 additions & 0 deletions runtime/android/app/src/main/assets/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Copy the final.onnx produced by export_onnx.py into this directory (filename must be final.onnx).
After rebuild and install, the app copies it from assets to internal storage for native inference.
55 changes: 55 additions & 0 deletions runtime/android/app/src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Native build for the Android demo. Compiles the shared runtime/core sources
# into static libraries (utils -> frontend -> speaker) and links them, together
# with ONNX Runtime, glog and gflags, into the `wespeaker_jni` shared library.
cmake_minimum_required(VERSION 3.22.1)
project(wespeaker_jni)

set(CMAKE_CXX_STANDARD 14)
# The static libraries below end up inside a shared library, so every object
# file has to be position independent.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# runtime/core: from this dir (app/src/main/cpp), five levels up to runtime/,
# then into core/.
set(CORE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../core")

# Fetches and configures gflags / glog (see deps.cmake in this directory).
include(deps.cmake)

# Same as wekws/runtime/android: headers and jni from Gradle-extracted AAR (no Prefab find_package).
# Typical path: app/build/onnxruntime-android-x.y.z.aar/
# build_DIR resolves to app/build (three levels up from app/src/main/cpp).
set(build_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../build")
file(GLOB ORT_ROOT_LIST "${build_DIR}/onnxruntime*.aar")
list(LENGTH ORT_ROOT_LIST _ort_len)
# Fail early with an actionable message when the AAR has not been unpacked yet.
if(_ort_len EQUAL 0)
message(FATAL_ERROR
"No ${build_DIR}/onnxruntime*.aar found. Run Gradle task extractAARForNativeBuild first (pulled in before configureCMake).")
endif()
# If several extracted AAR directories match, the first glob result is used.
list(GET ORT_ROOT_LIST 0 ORT_ROOT)
# onnxruntime-android AAR: headers live under headers/ (not headers/include/).
include_directories("${ORT_ROOT}/headers")
# Directory-scoped: must come before the targets below so that linking
# `onnxruntime` by name finds libonnxruntime.so for the ABI being built.
link_directories("${ORT_ROOT}/jni/${ANDROID_ABI}")

# NOTE(review): presumably selects the ONNX backend in runtime/core - confirm
# against the core sources.
add_definitions(-DUSE_ONNX)

# runtime/core utility code. Its PUBLIC include dir and glog/gflags links
# propagate to every target that links `utils`.
add_library(utils STATIC "${CORE_DIR}/utils/utils.cc")
target_include_directories(utils PUBLIC "${CORE_DIR}")
target_link_libraries(utils PUBLIC glog gflags)

# Feature extraction (feature pipeline + FFT).
add_library(frontend STATIC
"${CORE_DIR}/frontend/feature_pipeline.cc"
"${CORE_DIR}/frontend/fft.cc"
)
target_include_directories(frontend PUBLIC "${CORE_DIR}")
target_link_libraries(frontend PUBLIC utils)

# Speaker engine plus the ONNX model wrapper.
add_library(speaker STATIC
"${CORE_DIR}/speaker/speaker_engine.cc"
"${CORE_DIR}/speaker/onnx_speaker_model.cc"
)
target_include_directories(speaker PUBLIC "${CORE_DIR}")
target_link_libraries(speaker PUBLIC frontend onnxruntime)

# JNI bridge loaded by the app; `log` is the Android NDK logging library.
add_library(wespeaker_jni SHARED wespeaker_jni.cpp)
target_include_directories(wespeaker_jni PRIVATE "${CORE_DIR}")
target_link_libraries(wespeaker_jni
PRIVATE
speaker
onnxruntime
glog
gflags
log
)
34 changes: 34 additions & 0 deletions runtime/android/app/src/main/cpp/deps.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Android NDK: FetchContent for gflags / glog.
# Both archives are pinned by SHA256 so a changed or corrupted download fails
# the configure step instead of silently building different sources.
include(FetchContent)
set(FETCHCONTENT_QUIET ON)

FetchContent_Declare(gflags
URL https://github.com/gflags/gflags/archive/refs/tags/v2.3.0.zip
URL_HASH SHA256=ca732b5fd17bf3a27a01a6784b947cbe6323644ecc9e26bbe2117ec43bf7e13b)
FetchContent_MakeAvailable(gflags)

# Cache options set before glog is added: skip its tests and enable its gflags
# integration.
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(WITH_GFLAGS ON CACHE BOOL "" FORCE)

FetchContent_Declare(glog
URL https://github.com/google/glog/archive/v0.4.0.zip
URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc)
# glog is populated manually (instead of FetchContent_MakeAvailable) so its
# CMakeLists.txt can be patched before add_subdirectory() processes it.
FetchContent_GetProperties(glog)
if(NOT glog_POPULATED)
FetchContent_Populate(glog)
file(READ ${glog_SOURCE_DIR}/CMakeLists.txt _glog_cm)
# glog 0.4.0: bump cmake_minimum for CMake 4+; on Android, execinfo probe can pass but link fails.
# Patch 1: rewrite whatever minimum version glog declares to 3.10.
string(REGEX REPLACE
"cmake_minimum_required[ ]*\\([ ]*VERSION[ ]+[^)]+\\)"
"cmake_minimum_required(VERSION 3.10)" _glog_cm "${_glog_cm}")
# Patch 2 (Android only): force HAVE_EXECINFO_H to 0 instead of probing for
# the header.
if(ANDROID)
string(REPLACE
"check_include_file (execinfo.h HAVE_EXECINFO_H)"
"if(ANDROID)\n set(HAVE_EXECINFO_H 0)\nelse()\n check_include_file (execinfo.h HAVE_EXECINFO_H)\nendif()"
_glog_cm "${_glog_cm}")
endif()
# Write the patched file back in place, then build glog from it.
file(WRITE ${glog_SOURCE_DIR}/CMakeLists.txt "${_glog_cm}")
add_subdirectory(${glog_SOURCE_DIR} ${glog_BINARY_DIR})
endif()

# Directory-scoped include paths for the gflags and glog headers.
# NOTE(review): presumably the build dirs hold the generated headers and
# glog's src/ holds the static ones - confirm against the fetched trees.
include_directories(${gflags_BINARY_DIR}/include ${glog_SOURCE_DIR}/src ${glog_BINARY_DIR})
110 changes: 110 additions & 0 deletions runtime/android/app/src/main/cpp/wespeaker_jni.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Copyright 2023 Chengdong Liang (WeSpeaker runtime)
// SPDX-License-Identifier: Apache-2.0

#include <jni.h>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "frontend/wav.h"
#include "glog/logging.h"
#include "speaker/speaker_engine.h"

namespace {

// Guards the one-time glog setup performed by EnsureGlog().
std::once_flag g_glog_init;

// Initializes glog exactly once per process, however often (and from however
// many threads) the JNI entry point is invoked.
void EnsureGlog() {
  const auto configure_logging = []() {
    google::InitGoogleLogging("wespeaker");
    FLAGS_logtostderr = true;
    // Severity 2 is ERROR in glog: INFO and WARNING messages are dropped.
    FLAGS_minloglevel = 2;
  };
  std::call_once(g_glog_init, configure_logging);
}

// Packs two floats into a new Java float[2] as {a, b}.
// Returns nullptr when the array cannot be allocated.
jfloatArray MakeFloatArray(JNIEnv* env, float a, float b) {
  constexpr jsize kCount = 2;
  jfloatArray result = env->NewFloatArray(kCount);
  if (result != nullptr) {
    const jfloat values[kCount] = {a, b};
    env->SetFloatArrayRegion(result, 0, kCount, values);
  }
  return result;
}

// Raises java.io.IOException with `msg` on the calling Java thread. The
// exception only becomes visible once the native method returns, so callers
// must return right after calling this.
void ThrowIo(JNIEnv* env, const char* msg) {
  jclass io_exception = env->FindClass("java/io/IOException");
  if (io_exception == nullptr) {
    // Class lookup failed; nothing to throw with.
    return;
  }
  env->ThrowNew(io_exception, msg);
}

// Copies a Java string into a std::string using GetStringUTFChars.
// Returns an empty string for a null `s` or when the JVM cannot provide the
// characters; the JVM buffer is always released before returning.
std::string JStringToUtf8(JNIEnv* env, jstring s) {
  std::string utf8;
  if (s == nullptr) {
    return utf8;
  }
  const char* chars = env->GetStringUTFChars(s, nullptr);
  if (chars != nullptr) {
    utf8.assign(chars);
    env->ReleaseStringUTFChars(s, chars);
  }
  return utf8;
}

} // namespace

// JNI entry point for com.wespeaker.app.WespeakerNative.compare(...).
//
// Opens the enrollment and test WAV files, extracts one embedding per file
// with a SpeakerEngine built from the ONNX model at `j_model`, and scores the
// pair with SpeakerEngine::CosineSimilarity.
//
// Params:
//   j_enroll, j_test  paths of the enrollment / test WAV files
//   j_model           path of the ONNX model file
//   j_threshold       decision threshold applied to the similarity score
//   j_fbank_dim       Fbank feature dimension passed to the engine (> 0)
//   j_sample_rate     sample rate passed to the engine (> 0); the audio is
//                     not checked against it here
//
// Returns a Java float[2] {score, same}, where `same` is 1.0 when
// score >= threshold and 0.0 otherwise. On failure returns null with a Java
// exception pending: java.io.IOException for empty paths, non-positive
// feature settings, unreadable/empty audio, or any C++ exception raised
// during inference.
extern "C" JNIEXPORT jfloatArray JNICALL
Java_com_wespeaker_app_WespeakerNative_compare(JNIEnv* env, jclass /* clazz */,
                                               jstring j_enroll, jstring j_test,
                                               jstring j_model,
                                               jdouble j_threshold,
                                               jint j_fbank_dim,
                                               jint j_sample_rate) {
  EnsureGlog();

  // A failed string conversion leaves a Java exception pending. Most JNI
  // calls are not allowed in that state, so return immediately and let the
  // pending exception propagate to the caller.
  const std::string enroll_path = JStringToUtf8(env, j_enroll);
  if (env->ExceptionCheck()) return nullptr;
  const std::string test_path = JStringToUtf8(env, j_test);
  if (env->ExceptionCheck()) return nullptr;
  const std::string model_path = JStringToUtf8(env, j_model);
  if (env->ExceptionCheck()) return nullptr;
  const float threshold = static_cast<float>(j_threshold);

  if (enroll_path.empty() || test_path.empty() || model_path.empty()) {
    ThrowIo(env, "路径不能为空");
    return nullptr;
  }
  // Reject non-positive feature settings here rather than handing them to
  // the native engine unvalidated.
  // Message: "feature dimension and sample rate must be positive".
  if (j_fbank_dim <= 0 || j_sample_rate <= 0) {
    ThrowIo(env, "特征维度和采样率必须为正数");
    return nullptr;
  }

  try {
    wenet::WavReader enroll_reader;
    if (!enroll_reader.Open(enroll_path)) {
      ThrowIo(env, "无法打开注册音频(需有效 WAV)");
      return nullptr;
    }
    wenet::WavReader test_reader;
    if (!test_reader.Open(test_path)) {
      ThrowIo(env, "无法打开测试音频(需有效 WAV)");
      return nullptr;
    }
    if (enroll_reader.num_sample() <= 0 || test_reader.num_sample() <= 0) {
      ThrowIo(env, "音频长度无效");
      return nullptr;
    }

    // The engine never leaves this call, so it lives on the stack instead of
    // behind a shared_ptr.
    wespeaker::SpeakerEngine speaker_engine(
        model_path, j_fbank_dim, j_sample_rate,
        0 /* embedding size: infer from ONNX output shape */,
        -1 /* one embedding for full audio; same as per_chunk_samples_ <= 0 */);
    const int embedding_size = speaker_engine.EmbeddingSize();

    // NOTE(review): the const_cast is kept because ExtractEmbedding's
    // signature is not visible here; drop it if the engine accepts
    // `const int16_t*`.
    int16_t* enroll_data = const_cast<int16_t*>(enroll_reader.data());
    const int enroll_samples = enroll_reader.num_sample();
    int16_t* test_data = const_cast<int16_t*>(test_reader.data());
    const int test_samples = test_reader.num_sample();

    std::vector<float> enroll_emb(embedding_size, 0.f);
    std::vector<float> test_emb(embedding_size, 0.f);
    speaker_engine.ExtractEmbedding(enroll_data, enroll_samples, &enroll_emb);
    speaker_engine.ExtractEmbedding(test_data, test_samples, &test_emb);

    const float score = speaker_engine.CosineSimilarity(enroll_emb, test_emb);
    const float same = (score >= threshold) ? 1.f : 0.f;
    return MakeFloatArray(env, score, same);
  } catch (const std::exception& e) {
    // C++ exceptions must not cross the JNI boundary; surface them to Java.
    ThrowIo(env, e.what());
    return nullptr;
  } catch (...) {
    ThrowIo(env, "native 推理异常");
    return nullptr;
  }
}
Loading
Loading