diff --git a/pyproject.toml b/pyproject.toml
index 7ba1e0c..55371c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,11 +8,14 @@ dependencies = [
"agentscope>=1.0.10",
"dashscope>=1.25.5",
"loguru>=0.7.3",
+ "packaging>=25.0",
"pre-commit>=4.3.0",
"pydantic>=2.11.7",
"python-dotenv>=1.2.1",
"pyyaml>=6.0.2",
"ruff>=0.12.11",
+ "torch>=2.9.1",
+ "torchaudio>=2.9.1",
]
[tool.ruff]
diff --git a/silero-vad/CITATION.cff b/silero-vad/CITATION.cff
new file mode 100644
index 0000000..e794c3e
--- /dev/null
+++ b/silero-vad/CITATION.cff
@@ -0,0 +1,20 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+title: "Silero VAD"
+authors:
+ - family-names: "Silero Team"
+ email: "hello@silero.ai"
+type: software
+repository-code: "https://github.com/snakers4/silero-vad"
+license: MIT
+abstract: "Pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier"
+preferred-citation:
+ type: software
+ authors:
+ - family-names: "Silero Team"
+ email: "hello@silero.ai"
+ title: "Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier"
+ year: 2024
+ publisher: "GitHub"
+ journal: "GitHub repository"
+ howpublished: "https://github.com/snakers4/silero-vad"
diff --git a/silero-vad/CODE_OF_CONDUCT.md b/silero-vad/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..c69125e
--- /dev/null
+++ b/silero-vad/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at aveysov@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/silero-vad/LICENSE b/silero-vad/LICENSE
new file mode 100644
index 0000000..0bf5e90
--- /dev/null
+++ b/silero-vad/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020-present Silero Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/silero-vad/README.md b/silero-vad/README.md
new file mode 100644
index 0000000..57f72f9
--- /dev/null
+++ b/silero-vad/README.md
@@ -0,0 +1,178 @@
+[](mailto:hello@silero.ai) [](https://t.me/silero_speech) [](https://github.com/snakers4/silero-vad/blob/master/LICENSE) [](https://pypi.org/project/silero-vad/)
+
+[](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) [](https://github.com/snakers4/silero-vad/actions/workflows/test.yml) [](https://pypi.org/project/silero-vad/) [](https://pypi.org/project/silero-vad)
+
+
+
+
+
+
+
+
+Real Time Example
+
+https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4
+
+Please note that the video loads only if you are logged in to your GitHub account.
+
+
+
+
+
+
Fast start
+
+
+
+Dependencies
+
+ System requirements to run python examples on `x86-64` systems:
+
+ - `python 3.8+`;
+ - 1G+ RAM;
+ - A modern CPU with AVX, AVX2, AVX-512 or AMX instruction sets.
+
+ Dependencies:
+
+ - `torch>=1.12.0`;
+ - `torchaudio>=0.12.0` (for I/O only);
+ - `onnxruntime>=1.16.1` (for ONNX model usage).
+
+ Silero VAD uses torchaudio library for audio I/O (`torchaudio.info`, `torchaudio.load`, and `torchaudio.save`), so a proper audio backend is required:
+
+ - Option №1 - [**FFmpeg**](https://www.ffmpeg.org/) backend. `conda install -c conda-forge 'ffmpeg<7'`;
+ - Option №2 - [**sox_io**](https://pypi.org/project/sox/) backend. `apt-get install sox`, TorchAudio is tested on libsox 14.4.2;
+ - Option №3 - [**soundfile**](https://pypi.org/project/soundfile/) backend. `pip install soundfile`.
+
+If you are planning to run the VAD using solely the `onnx-runtime`, it will run on any other system architectures where onnx-runtime is [supported](https://onnxruntime.ai/getting-started). In this case please note that:
+
+- You will have to implement the I/O;
+- You will have to adapt the existing wrappers / examples / post-processing for your use-case.
+
+
+
+**Using pip**:
+`pip install silero-vad`
+
+```python3
+from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
+model = load_silero_vad()
+wav = read_audio('path_to_audio_file')
+speech_timestamps = get_speech_timestamps(
+ wav,
+ model,
+ return_seconds=True, # Return speech timestamps in seconds (default is samples)
+)
+```
+
+**Using torch.hub**:
+```python3
+import torch
+torch.set_num_threads(1)
+
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+(get_speech_timestamps, _, read_audio, _, _) = utils
+
+wav = read_audio('path_to_audio_file')
+speech_timestamps = get_speech_timestamps(
+ wav,
+ model,
+ return_seconds=True, # Return speech timestamps in seconds (default is samples)
+)
+```
+
+
+
+
Key Features
+
+
+- **Stellar accuracy**
+
+ Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.
+
+- **Fast**
+
+ One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1ms** to be processed on a single CPU thread. Using batching or GPU can also improve performance considerably. Under certain conditions ONNX may even run up to 4-5x faster.
+
+- **Lightweight**
+
+ JIT model is around two megabytes in size.
+
+- **General**
+
+  Silero VAD was trained on huge corpora that include over **6000** languages and it performs well on audio from different domains with various background noise and quality levels.
+
+- **Flexible sampling rate**
+
+ Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).
+
+- **Highly Portable**
+
+ Silero VAD reaps benefits from the rich ecosystems built around **PyTorch** and **ONNX** running everywhere where these runtimes are available.
+
+- **No Strings Attached**
+
+  Published under a permissive license (MIT), Silero VAD has zero strings attached - no telemetry, no keys, no registration, no built-in expiration, no vendor lock-in.
+
+
+
+
Typical Use Cases
+
+
+- Voice activity detection for IOT / edge / mobile use cases
+- Data cleaning and preparation, voice detection in general
+- Telephony and call-center automation, voice bots
+- Voice interfaces
+
+
+
Links
+
+
+
+- [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies)
+- [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics)
+- [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics)
+- [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models)
+- [Further reading](https://github.com/snakers4/silero-models#further-reading)
+- [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ)
+
+
+
Get In Touch
+
+
+Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, read our [news](https://t.me/silero_news).
+
+Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for relevant information and [email](mailto:hello@silero.ai) us directly.
+
+**Citations**
+
+```
+@misc{Silero VAD,
+ author = {Silero Team},
+ title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
+ year = {2024},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/snakers4/silero-vad}},
+ commit = {insert_some_commit_here},
+ email = {hello@silero.ai}
+}
+```
+
+
+
Examples and VAD-based Community Apps
+
+
+- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
+
+- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
+
+- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example), [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp), [C#](https://github.com/snakers4/silero-vad/tree/master/examples/csharp) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) community examples
diff --git a/silero-vad/datasets/README.md b/silero-vad/datasets/README.md
new file mode 100644
index 0000000..7c8231d
--- /dev/null
+++ b/silero-vad/datasets/README.md
@@ -0,0 +1,84 @@
+# Датасет Silero-VAD
+
+> Датасет создан при поддержке Фонда содействия инновациям в рамках федерального проекта «Искусственный
+интеллект» национальной программы «Цифровая экономика Российской Федерации».
+
+По ссылкам ниже представлены `.feather` файлы, содержащие размеченные с помощью Silero VAD открытые наборы аудиоданных, а также короткое описание каждого набора данных с примерами загрузки. `.feather` файлы можно открыть с помощью библиотеки `pandas`:
+```python3
+import pandas as pd
+dataframe = pd.read_feather(PATH_TO_FEATHER_FILE)
+```
+
+Каждый `.feather` файл с разметкой содержит следующие колонки:
+- `speech_timings` - разметка данного аудио. Это список, содержащий словари вида `{'start': START_SECOND, 'end': END_SECOND}`, где `START_SECOND` и `END_SECOND` - время начала и конца речи в секундах. Количество данных словарей равно количеству речевых аудио отрывков, найденных в данном аудио;
+- `language` - ISO код языка данного аудио.
+
+Колонки, содержащие информацию о загрузке аудио файла различаются и описаны для каждого набора данных ниже.
+
+**Все данные размечены при временной дискретизации в ~30 миллисекунд (`num_samples` - 512)**
+
+| Название | Число часов | Число языков | Ссылка | Лицензия | md5sum |
+|----------------------|-------------|-------------|--------|----------|----------|
+| **Bible.is** | 53,138 | 1,596 | [URL](https://live.bible.is/) | [Уникальная](https://live.bible.is/terms) | ea404eeaf2cd283b8223f63002be11f9 |
+| **globalrecordings.net** | 9,743 | 6,171[^1] | [URL](https://globalrecordings.net/en) | CC BY-NC-SA 4.0 | 3c5c0f31b0abd9fe94ddbe8b1e2eb326 |
+| **VoxLingua107** | 6,628 | 107 | [URL](https://bark.phon.ioc.ee/voxlingua107/) | CC BY 4.0 | 5dfef33b4d091b6d399cfaf3d05f2140 |
+| **Common Voice** | 30,329 | 120 | [URL](https://commonvoice.mozilla.org/en/datasets) | CC0 | 5e30a85126adf74a5fd1496e6ac8695d |
+| **MLS** | 50,709 | 8 | [URL](https://www.openslr.org/94/) | CC BY 4.0 | a339d0e94bdf41bba3c003756254ac4e |
+| **Итого** | **150,547** | **6,171+** | | | |
+
+## Bible.is
+
+[Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/BibleIs.feather)
+
+- Колонка `audio_link` содержит ссылки на конкретные аудио файлы.
+
+## globalrecordings.net
+
+[Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/globalrecordings.feather)
+
+- Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
+- Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
+
+``Количество уникальных ISO кодов данного датасета не совпадает с фактическим количеством представленных языков, т.к некоторые близкие языки могут кодироваться одним и тем же ISO кодом.``
+
+## VoxLingua107
+
+[Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/VoxLingua107.feather)
+
+- Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
+- Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
+
+## Common Voice
+
+[Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/common_voice.feather)
+
+Этот датасет невозможно скачать по статичным ссылкам. Для загрузки необходимо перейти по [ссылке](https://commonvoice.mozilla.org/en/datasets) и, получив доступ в соответствующей форме, скачать архивы для каждого доступного языка. Внимание! Представленная разметка актуальна для версии исходного датасета `Common Voice Corpus 16.1`.
+
+- Колонка `audio_path` содержит уникальные названия `.mp3` файлов, полученных после скачивания соответствующего датасета.
+
+## MLS
+
+[Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/MLS.feather)
+
+- Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
+- Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
+
+## Лицензия
+
+Данный датасет распространяется под [лицензией](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en) `CC BY-NC-SA 4.0`.
+
+## Цитирование
+
+```
+@misc{Silero VAD Dataset,
+ author = {Silero Team},
+ title = {Silero-VAD Dataset: a large public Internet-scale dataset for voice activity detection for 6000+ languages},
+ year = {2024},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/snakers4/silero-vad/datasets/README.md}},
+ email = {hello@silero.ai}
+}
+```
+
+[^1]: ``Количество уникальных ISO кодов данного датасета не совпадает с фактическим количеством представленных языков, т.к некоторые близкие языки могут кодироваться одним и тем же ISO кодом.``
diff --git a/silero-vad/examples/c++/README.md b/silero-vad/examples/c++/README.md
new file mode 100644
index 0000000..dbd8674
--- /dev/null
+++ b/silero-vad/examples/c++/README.md
@@ -0,0 +1,49 @@
+# Silero-VAD V6 in C++ (based on LibTorch)
+
+This is the source code for Silero-VAD V6 in C++, utilizing LibTorch & Onnxruntime.
+You should compare its results with the Python version.
+Results at 16 and 8kHz have been tested. Batch and CUDA inference options are deprecated.
+
+
+## Requirements
+- GCC 11.4.0 (GCC >= 5.1)
+- Onnxruntime 1.11.0 (other versions are also acceptable)
+- LibTorch 1.13.0 (other versions are also acceptable)
+
+## Download LibTorch
+
+```bash
+-Onnxruntime
+$wget https://github.com/microsoft/onnxruntime/releases/download/v1.11.1/onnxruntime-linux-x64-1.11.1.tgz
+$tar -xvf onnxruntime-linux-x64-1.11.1.tgz
+$ln -s onnxruntime-linux-x64-1.11.1 onnxruntime-linux #soft-link
+
+-Libtorch
+$wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
+$unzip libtorch-shared-with-deps-1.13.0+cpu.zip
+```
+
+## Compilation
+
+```bash
+-ONNX-build
+$g++ main.cc silero.cc -I ./onnxruntime-linux/include/ -L ./onnxruntime-linux/lib/ -lonnxruntime -Wl,-rpath,./onnxruntime-linux/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_ONNX
+
+-TORCH-build
+$g++ main.cc silero.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_TORCH
+```
+
+## Optional Compilation Flags
+-DUSE_TORCH
+-DUSE_ONNX
+
+## Run the Program
+To run the program, use the following command:
+
+`./silero `
+`./silero aepyx.wav 16000 0.5`
+`./silero aepyx_8k.wav 8000 0.5`
+
+The sample file aepyx.wav is part of the Voxconverse dataset.
+File details: aepyx.wav is a 16kHz, 16-bit audio file.
+File details: aepyx_8k.wav is an 8kHz, 16-bit audio file.
diff --git a/silero-vad/examples/c++/aepyx.wav b/silero-vad/examples/c++/aepyx.wav
new file mode 100644
index 0000000..e7cc293
Binary files /dev/null and b/silero-vad/examples/c++/aepyx.wav differ
diff --git a/silero-vad/examples/c++/aepyx_8k.wav b/silero-vad/examples/c++/aepyx_8k.wav
new file mode 100644
index 0000000..53297cd
Binary files /dev/null and b/silero-vad/examples/c++/aepyx_8k.wav differ
diff --git a/silero-vad/examples/c++/main.cc b/silero-vad/examples/c++/main.cc
new file mode 100644
index 0000000..ec6203f
--- /dev/null
+++ b/silero-vad/examples/c++/main.cc
@@ -0,0 +1,61 @@
+#include
+#include "silero.h"
+#include "wav.h"
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 4){
+ std::cerr<<"Usage : "<"< input_wav(wav_reader.num_samples());
+
+ for (int i = 0; i < wav_reader.num_samples(); i++)
+ {
+ input_wav[i] = static_cast(*(wav_reader.data() + i));
+ }
+
+ vad.SpeechProbs(input_wav);
+
+ std::vector speeches = vad.GetSpeechTimestamps();
+ for(const auto& speech : speeches){
+ if(vad.print_as_samples){
+ std::cout<<"{'start': "<(speech.start)<<", 'end': "<(speech.end)<<"}"<& input_wav) {
+ int num_samples = input_wav.size();
+ int num_chunks = num_samples / window_size_samples;
+ int remainder_samples = num_samples % window_size_samples;
+ total_sample_size += num_samples;
+
+ std::vector chunks;
+
+ for (int i = 0; i < num_chunks; i++) {
+ float* chunk_start = input_wav.data() + i * window_size_samples;
+ torch::Tensor chunk = torch::from_blob(chunk_start, {1, window_size_samples}, torch::kFloat32);
+ chunks.push_back(chunk);
+
+ if (i == num_chunks - 1 && remainder_samples > 0) {
+ int remaining_samples = num_samples - num_chunks * window_size_samples;
+ float* chunk_start_remainder = input_wav.data() + num_chunks * window_size_samples;
+ torch::Tensor remainder_chunk = torch::from_blob(chunk_start_remainder, {1, remaining_samples}, torch::kFloat32);
+ torch::Tensor padded_chunk = torch::cat({remainder_chunk, torch::zeros({1, window_size_samples - remaining_samples}, torch::kFloat32)}, 1);
+ chunks.push_back(padded_chunk);
+ }
+ }
+
+ if (!chunks.empty()) {
+ std::vector outputs;
+ torch::Tensor batched_chunks = torch::stack(chunks);
+ for (size_t i = 0; i < chunks.size(); i++) {
+ torch::NoGradGuard no_grad;
+ std::vector inputs;
+ inputs.push_back(batched_chunks[i]);
+ inputs.push_back(sample_rate);
+ torch::Tensor output = model.forward(inputs).toTensor();
+ outputs.push_back(output);
+ }
+ torch::Tensor all_outputs = torch::stack(outputs);
+ for (size_t i = 0; i < chunks.size(); i++) {
+ float output_f = all_outputs[i].item();
+ outputs_prob.push_back(output_f);
+ //////To print Probs by libtorch
+ //std::cout << "Chunk " << i << " prob: " << output_f<< "\n";
+ }
+ }
+ }
+
+
+#elif USE_ONNX
+
+ VadIterator::VadIterator(const std::string &model_path,
+ float threshold,
+ int sample_rate,
+ int window_size_ms,
+ int speech_pad_ms,
+ int min_silence_duration_ms,
+ int min_speech_duration_ms,
+ int max_duration_merge_ms,
+ bool print_as_samples)
+ :sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms),
+ speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms),
+ min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms),
+ print_as_samples(print_as_samples),
+ env(ORT_LOGGING_LEVEL_ERROR, "Vad"), session_options(), session(nullptr), allocator(),
+ memory_info(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU)), context_samples(64),
+ _context(64, 0.0f), current_sample(0), size_state(2 * 1 * 128),
+ input_node_names({"input", "state", "sr"}), output_node_names({"output", "stateN"}),
+ state_node_dims{2, 1, 128}, sr_node_dims{1}
+
+ {
+ init_onnx_model(model_path);
+ }
+ VadIterator::~VadIterator(){
+ }
+
+ void VadIterator::init_onnx_model(const std::string& model_path) {
+ int inter_threads=1;
+ int intra_threads=1;
+ session_options.SetIntraOpNumThreads(intra_threads);
+ session_options.SetInterOpNumThreads(inter_threads);
+ session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+ session = std::make_shared(env, model_path.c_str(), session_options);
+ std::cout<<"Silero onnx-Model loaded successfully"<& data_chunk) {
+ // _context와 현재 청크를 결합하여 입력 데이터 구성
+ std::vector new_data(effective_window_size, 0.0f);
+ std::copy(_context.begin(), _context.end(), new_data.begin());
+ std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
+ input = new_data;
+
+ Ort::Value input_ort = Ort::Value::CreateTensor(
+ memory_info, input.data(), input.size(), input_node_dims, 2);
+ Ort::Value state_ort = Ort::Value::CreateTensor(
+ memory_info, _state.data(), _state.size(), state_node_dims, 3);
+ Ort::Value sr_ort = Ort::Value::CreateTensor(
+ memory_info, sr.data(), sr.size(), sr_node_dims, 1);
+ ort_inputs.clear();
+ ort_inputs.push_back(std::move(input_ort));
+ ort_inputs.push_back(std::move(state_ort));
+ ort_inputs.push_back(std::move(sr_ort));
+
+ ort_outputs = session->Run(
+ Ort::RunOptions{ nullptr },
+ input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
+ output_node_names.data(), output_node_names.size());
+
+ float speech_prob = ort_outputs[0].GetTensorMutableData()[0]; // ONNX 출력: 첫 번째 값이 음성 확률
+
+ float* stateN = ort_outputs[1].GetTensorMutableData(); // 두 번째 출력값: 상태 업데이트
+ std::memcpy(_state.data(), stateN, size_state * sizeof(float));
+
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
+ // _context 업데이트: new_data의 마지막 context_samples 유지
+
+ return speech_prob;
+ }
+ void VadIterator::SpeechProbs(std::vector& input_wav) {
+ reset_states();
+ total_sample_size = static_cast(input_wav.size());
+ for (size_t j = 0; j < static_cast(total_sample_size); j += window_size_samples) {
+ if (j + window_size_samples > static_cast(total_sample_size))
+ break;
+ std::vector chunk(input_wav.begin() + j, input_wav.begin() + j + window_size_samples);
+ float speech_prob = predict(chunk);
+ outputs_prob.push_back(speech_prob);
+ }
+ }
+
+#endif
+
+ void VadIterator::reset_states() {
+ triggered = false;
+ current_sample = 0;
+ temp_end = 0;
+ outputs_prob.clear();
+ total_sample_size = 0;
+
+#ifdef USE_TORCH
+ model.run_method("reset_states"); // Reset model states if applicable
+#elif USE_ONNX
+ std::memset(_state.data(), 0, _state.size() * sizeof(float));
+ std::fill(_context.begin(), _context.end(), 0.0f);
+#endif
+ }
+
+ std::vector VadIterator::GetSpeechTimestamps() {
+ std::vector speeches = DoVad();
+ if(!print_as_samples){
+ for (auto& speech : speeches) {
+ speech.start /= sample_rate;
+ speech.end /= sample_rate;
+ }
+ }
+ return speeches;
+ }
+
+ void VadIterator::SetVariables(){
+ // Initialize internal engine parameters
+ init_engine(window_size_ms);
+ }
+
+ void VadIterator::init_engine(int window_size_ms) {
+ min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
+ speech_pad_samples = sample_rate * speech_pad_ms / 1000;
+ window_size_samples = sample_rate / 1000 * window_size_ms;
+ min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
+#ifdef USE_ONNX
+ //for ONNX
+ context_samples=window_size_samples / 8;
+ _context.assign(context_samples, 0.0f);
+
+ effective_window_size = window_size_samples + context_samples; // 예: 512 + 64 = 576 samples
+ input_node_dims[0] = 1;
+ input_node_dims[1] = effective_window_size;
+ _state.resize(size_state);
+ sr.resize(1);
+ sr[0] = sample_rate;
+#endif
+ }
+
+ std::vector VadIterator::DoVad() {
+ std::vector speeches;
+ for (size_t i = 0; i < outputs_prob.size(); ++i) {
+ float speech_prob = outputs_prob[i];
+ current_sample += window_size_samples;
+ if (speech_prob >= threshold && temp_end != 0) {
+ temp_end = 0;
+ }
+
+ if (speech_prob >= threshold) {
+ if (!triggered) {
+ triggered = true;
+ Interval segment;
+ segment.start = std::max(0, current_sample - speech_pad_samples - window_size_samples);
+ speeches.push_back(segment);
+ }
+ }else {
+ if (triggered) {
+ if (speech_prob < threshold - 0.15f) {
+ if (temp_end == 0) {
+ temp_end = current_sample;
+ }
+ if (current_sample - temp_end >= min_silence_samples) {
+ Interval& segment = speeches.back();
+ segment.end = temp_end + speech_pad_samples - window_size_samples;
+ temp_end = 0;
+ triggered = false;
+ }
+ }
+ }
+ }
+
+
+ }
+
+ if (triggered) {
+ std::cout<<"Finalizing active speech segment at stream end."<speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
+ }), speeches.end());
+
+ reset_states();
+ return speeches;
+ }
+
+
+ } // namespace silero
+
diff --git a/silero-vad/examples/c++/silero.h b/silero-vad/examples/c++/silero.h
new file mode 100644
index 0000000..8d4a11e
--- /dev/null
+++ b/silero-vad/examples/c++/silero.h
@@ -0,0 +1,123 @@
+#ifndef SILERO_H
+#define SILERO_H
+
+// silero.h
+// Author : NathanJHLee
+// Created On : 2025-11-10
+// Description : silero 6.2 system for onnx-runtime(c++) and torch-script(c++)
+// Version : 1.3
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef USE_TORCH
+#include
+#include
+#elif USE_ONNX
+#include "onnxruntime_cxx_api.h"
+#endif
+
+namespace silero {
+
+ struct Interval {
+ float start;
+ float end;
+ int numberOfSubseg;
+
+ void initialize() {
+ start = 0;
+ end = 0;
+ numberOfSubseg = 0;
+ }
+ };
+
+ class VadIterator {
+ public:
+ VadIterator(const std::string &model_path,
+ float threshold = 0.5,
+ int sample_rate = 16000,
+ int window_size_ms = 32,
+ int speech_pad_ms = 30,
+ int min_silence_duration_ms = 100,
+ int min_speech_duration_ms = 250,
+ int max_duration_merge_ms = 300,
+ bool print_as_samples = false);
+ ~VadIterator();
+
+ // Batch (non-streaming) interface (for backward compatibility)
+ void SpeechProbs(std::vector& input_wav);
+ std::vector GetSpeechTimestamps();
+ void SetVariables();
+
+ // Public parameters (can be modified by user)
+ float threshold;
+ int sample_rate;
+ int window_size_ms;
+ int min_speech_duration_ms;
+ int max_duration_merge_ms;
+ bool print_as_samples;
+
+ private:
+#ifdef USE_TORCH
+ torch::jit::script::Module model;
+ void init_torch_model(const std::string& model_path);
+#elif USE_ONNX
+ Ort::Env env; // 환경 객체
+ Ort::SessionOptions session_options; // 세션 옵션
+ std::shared_ptr session; // ONNX 세션
+ Ort::AllocatorWithDefaultOptions allocator; // 기본 할당자
+ Ort::MemoryInfo memory_info; // 메모리 정보 (CPU)
+
+ void init_onnx_model(const std::string& model_path);
+ float predict(const std::vector& data_chunk);
+
+ //const int context_samples; // 예: 64 samples
+ int context_samples; // 예: 64 samples
+ std::vector _context; // 초기값 모두 0
+ int effective_window_size;
+
+ // ONNX 입력/출력 관련 버퍼 및 노드 이름들
+ std::vector ort_inputs;
+ std::vector input_node_names;
+ std::vector input;
+ unsigned int size_state; // 고정값: 2*1*128
+ std::vector _state;
+ std::vector sr;
+ int64_t input_node_dims[2]; // [1, effective_window_size]
+ const int64_t state_node_dims[3]; // [ 2, 1, 128 ]
+ const int64_t sr_node_dims[1]; // [ 1 ]
+ std::vector ort_outputs;
+ std::vector output_node_names; // 기본값: [ "output", "stateN" ]
+#endif
+ std::vector outputs_prob; // used in batch mode
+ int min_silence_samples;
+ int min_speech_samples;
+ int speech_pad_samples;
+ int window_size_samples;
+ int duration_merge_samples;
+ int current_sample = 0;
+ int total_sample_size = 0;
+ int min_silence_duration_ms;
+ int speech_pad_ms;
+ bool triggered = false;
+ int temp_end = 0;
+ int global_end = 0;
+ int erase_tail_count = 0;
+
+
+ void init_engine(int window_size_ms);
+ void reset_states();
+ std::vector DoVad();
+
+
+ };
+
+} // namespace silero
+
+#endif // SILERO_H
+
diff --git a/silero-vad/examples/c++/wav.h b/silero-vad/examples/c++/wav.h
new file mode 100644
index 0000000..d567ee6
--- /dev/null
+++ b/silero-vad/examples/c++/wav.h
@@ -0,0 +1,237 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+ char riff[4]; // "riff"
+ unsigned int size;
+ char wav[4]; // "WAVE"
+ char fmt[4]; // "fmt "
+ unsigned int fmt_size;
+ uint16_t format;
+ uint16_t channels;
+ unsigned int sample_rate;
+ unsigned int bytes_per_second;
+ uint16_t block_size;
+ uint16_t bit;
+ char data[4]; // "data"
+ unsigned int data_size;
+};
+
+class WavReader {
+ public:
+ WavReader() : data_(nullptr) {}
+ explicit WavReader(const std::string& filename) { Open(filename); }
+
+ bool Open(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "rb"); //open file for reading
+ if (NULL == fp) {
+ std::cout << "Error in read " << filename;
+ return false;
+ }
+
+ WavHeader header;
+ fread(&header, 1, sizeof(header), fp);
+ if (header.fmt_size < 16) {
+ printf("WaveData: expect PCM format data "
+ "to have fmt chunk of at least size 16.\n");
+ return false;
+ } else if (header.fmt_size > 16) {
+ int offset = 44 - 8 + header.fmt_size - 16;
+ fseek(fp, offset, SEEK_SET);
+ fread(header.data, 8, sizeof(char), fp);
+ }
+ // check "riff" "WAVE" "fmt " "data"
+
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
+ // be a single "fact" sub chunk, but on Windows there can also be a
+ // "list" sub chunk.
+ while (0 != strncmp(header.data, "data", 4)) {
+ // We will just ignore the data in these chunks.
+ fseek(fp, header.data_size, SEEK_CUR);
+ // read next sub chunk
+ fread(header.data, 8, sizeof(char), fp);
+ }
+
+ if (header.data_size == 0) {
+ int offset = ftell(fp);
+ fseek(fp, 0, SEEK_END);
+ header.data_size = ftell(fp) - offset;
+ fseek(fp, offset, SEEK_SET);
+ }
+
+ num_channel_ = header.channels;
+ sample_rate_ = header.sample_rate;
+ bits_per_sample_ = header.bit;
+ int num_data = header.data_size / (bits_per_sample_ / 8);
+ data_ = new float[num_data]; // Create 1-dim array
+ num_samples_ = num_data / num_channel_;
+
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+ std::cout << "num_samples :" << num_data << std::endl;
+ std::cout << "num_data_size :" << header.data_size << std::endl;
+
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(char), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 16: {
+ int16_t sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int16_t), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 32:
+ {
+ if (header.format == 1) //S32
+ {
+ int sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ }
+ else if (header.format == 3) // IEEE-float
+ {
+ float sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(float), fp);
+ data_[i] = static_cast(sample);
+ }
+ }
+ else {
+ printf("unsupported quantization bits\n");
+ }
+ break;
+ }
+ default:
+ printf("unsupported quantization bits\n");
+ break;
+ }
+
+ fclose(fp);
+ return true;
+ }
+
+ int num_channel() const { return num_channel_; }
+ int sample_rate() const { return sample_rate_; }
+ int bits_per_sample() const { return bits_per_sample_; }
+ int num_samples() const { return num_samples_; }
+
+ ~WavReader() {
+ delete[] data_;
+ }
+
+ const float* data() const { return data_; }
+
+ private:
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+ int num_samples_; // sample points per channel
+ float* data_;
+};
+
+class WavWriter {
+ public:
+ WavWriter(const float* data, int num_samples, int num_channel,
+ int sample_rate, int bits_per_sample)
+ : data_(data),
+ num_samples_(num_samples),
+ num_channel_(num_channel),
+ sample_rate_(sample_rate),
+ bits_per_sample_(bits_per_sample) {}
+
+ void Write(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "w");
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
+ WavHeader header;
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+ memcpy(&header, wav_header, sizeof(header));
+ header.channels = num_channel_;
+ header.bit = bits_per_sample_;
+ header.sample_rate = sample_rate_;
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+ header.size = sizeof(header) - 8 + header.data_size;
+ header.bytes_per_second =
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+ fwrite(&header, 1, sizeof(header), fp);
+
+ for (int i = 0; i < num_samples_; ++i) {
+ for (int j = 0; j < num_channel_; ++j) {
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 16: {
+ int16_t sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 32: {
+ int sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ }
+ }
+ }
+ fclose(fp);
+ }
+
+ private:
+ const float* data_;
+ int num_samples_; // total float points in data_
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+};
+
+} // namespace wav
+
+#endif // FRONTEND_WAV_H_
+
+
diff --git a/silero-vad/examples/colab_record_example.ipynb b/silero-vad/examples/colab_record_example.ipynb
new file mode 100644
index 0000000..4de7e26
--- /dev/null
+++ b/silero-vad/examples/colab_record_example.ipynb
@@ -0,0 +1,237 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bccAucKjnPHm"
+ },
+ "source": [
+ "### Dependencies and inputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "cSih95WFmwgi"
+ },
+ "outputs": [],
+ "source": [
+ "#!apt install ffmpeg\n",
+ "!pip -q install pydub\n",
+ "from google.colab import output\n",
+ "from base64 import b64decode, b64encode\n",
+ "from io import BytesIO\n",
+ "import numpy as np\n",
+ "from pydub import AudioSegment\n",
+ "from IPython.display import HTML, display\n",
+ "import torch\n",
+ "import matplotlib.pyplot as plt\n",
+ "import moviepy.editor as mpe\n",
+ "from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
+ "import matplotlib\n",
+ "matplotlib.use('Agg')\n",
+ "\n",
+ "torch.set_num_threads(1)\n",
+ "\n",
+ "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
+ " model='silero_vad',\n",
+ " force_reload=True)\n",
+ "\n",
+ "def int2float(audio):\n",
+ " samples = audio.get_array_of_samples()\n",
+ " new_sound = audio._spawn(samples)\n",
+ " arr = np.array(samples).astype(np.float32)\n",
+ " arr = arr / np.abs(arr).max()\n",
+ " return arr\n",
+ "\n",
+ "AUDIO_HTML = \"\"\"\n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "def record(sec=10):\n",
+ " display(HTML(AUDIO_HTML))\n",
+ " s = output.eval_js(\"data\")\n",
+ " b = b64decode(s.split(',')[1])\n",
+ " audio = AudioSegment.from_file(BytesIO(b))\n",
+ " audio.export('test.mp3', format='mp3')\n",
+ " audio = audio.set_channels(1)\n",
+ " audio = audio.set_frame_rate(16000)\n",
+ " audio_float = int2float(audio)\n",
+ " audio_tens = torch.tensor(audio_float)\n",
+ " return audio_tens\n",
+ "\n",
+ "def make_animation(probs, audio_duration, interval=40):\n",
+ " fig = plt.figure(figsize=(16, 9))\n",
+ " ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
+ " line, = ax.plot([], [], lw=2)\n",
+ " x = [i / 16000 * 512 for i in range(len(probs))]\n",
+ " plt.xlabel('Time, seconds', fontsize=16)\n",
+ " plt.ylabel('Speech Probability', fontsize=16)\n",
+ "\n",
+ " def init():\n",
+ " plt.fill_between(x, probs, color='#064273')\n",
+ " line.set_data([], [])\n",
+ " line.set_color('#990000')\n",
+ " return line,\n",
+ "\n",
+ " def animate(i):\n",
+ " x = i * interval / 1000 - 0.04\n",
+ " y = np.linspace(0, 1.02, 2)\n",
+ "\n",
+ " line.set_data(x, y)\n",
+ " line.set_color('#990000')\n",
+ " return line,\n",
+ " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
+ "\n",
+ " f = r\"animation.mp4\"\n",
+ " writervideo = FFMpegWriter(fps=1000/interval)\n",
+ " anim.save(f, writer=writervideo)\n",
+ " plt.close('all')\n",
+ "\n",
+ "def combine_audio(vidname, audname, outname, fps=25):\n",
+ " my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
+ " audio_background = mpe.AudioFileClip(audname)\n",
+ " final_clip = my_clip.set_audio(audio_background)\n",
+ " final_clip.write_videofile(outname,fps=fps,verbose=False)\n",
+ "\n",
+ "def record_make_animation():\n",
+ " tensor = record()\n",
+ " print('Calculating probabilities...')\n",
+ " speech_probs = []\n",
+ " window_size_samples = 512\n",
+ " speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
+ " model.reset_states()\n",
+ " print('Making animation...')\n",
+ " make_animation(speech_probs, len(tensor) / 16000)\n",
+ "\n",
+ " print('Merging your voice with animation...')\n",
+ " combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n",
+ " print('Done!')\n",
+ " mp4 = open('merged.mp4','rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " display(HTML(\"\"\"\n",
+ " \n",
+ " \"\"\" % data_url))\n",
+ "\n",
+ " return speech_probs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IFVs3GvTnpB1"
+ },
+ "source": [
+ "## Record example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "5EBjrTwiqAaQ"
+ },
+ "outputs": [],
+ "source": [
+ "speech_probs = record_make_animation()"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [
+ "bccAucKjnPHm"
+ ],
+ "name": "Untitled2.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/silero-vad/examples/cpp/README.md b/silero-vad/examples/cpp/README.md
new file mode 100644
index 0000000..93a6791
--- /dev/null
+++ b/silero-vad/examples/cpp/README.md
@@ -0,0 +1,43 @@
+# Stream example in C++
+
+Here's a simple example of the vad model in c++ onnxruntime.
+
+
+
+## Requirements
+
+Code is tested in the environments below; feel free to try others.
+
+- WSL2 + Debian-bullseye (docker)
+- gcc 12.2.0
+- onnxruntime-linux-x64-1.12.1
+
+
+
+## Usage
+
+1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye`
+
+2. Install onnxruntime-linux-x64-1.12.1
+
+ - Download lib onnxruntime:
+
+ `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz`
+
+ - Unzip. Assume the path is `/root/onnxruntime-linux-x64-1.12.1`
+
+3. Modify wav path & Test configs in main function
+
+ `wav::WavReader wav_reader("${path_to_your_wav_file}");`
+
+ test sample rate, frame per ms, threshold...
+
+4. Build with gcc and run
+
+ ```bash
+ # Build
+ g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test
+
+ # Run
+ ./test
+ ```
\ No newline at end of file
diff --git a/silero-vad/examples/cpp/silero-vad-onnx.cpp b/silero-vad/examples/cpp/silero-vad-onnx.cpp
new file mode 100644
index 0000000..380d76d
--- /dev/null
+++ b/silero-vad/examples/cpp/silero-vad-onnx.cpp
@@ -0,0 +1,367 @@
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include // for std::rint
+#if __cplusplus < 201703L
+#include
+#endif
+
+//#define __DEBUG_SPEECH_PROB___
+
+#include "onnxruntime_cxx_api.h"
+#include "wav.h" // For reading WAV files
+
+// timestamp_t class: stores the start and end (in samples) of a speech segment.
+class timestamp_t {
+public:
+ int start;
+ int end;
+
+ timestamp_t(int start = -1, int end = -1)
+ : start(start), end(end) { }
+
+ timestamp_t& operator=(const timestamp_t& a) {
+ start = a.start;
+ end = a.end;
+ return *this;
+ }
+
+ bool operator==(const timestamp_t& a) const {
+ return (start == a.start && end == a.end);
+ }
+
+ // Returns a formatted string of the timestamp.
+ std::string c_str() const {
+ return format("{start:%08d, end:%08d}", start, end);
+ }
+private:
+ // Helper function for formatting.
+ std::string format(const char* fmt, ...) const {
+ char buf[256];
+ va_list args;
+ va_start(args, fmt);
+ const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ if (r < 0)
+ return {};
+ const size_t len = r;
+ if (len < sizeof(buf))
+ return std::string(buf, len);
+#if __cplusplus >= 201703L
+ std::string s(len, '\0');
+ va_start(args, fmt);
+ std::vsnprintf(s.data(), len + 1, fmt, args);
+ va_end(args);
+ return s;
+#else
+ auto vbuf = std::unique_ptr(new char[len + 1]);
+ va_start(args, fmt);
+ std::vsnprintf(vbuf.get(), len + 1, fmt, args);
+ va_end(args);
+ return std::string(vbuf.get(), len);
+#endif
+ }
+};
+
+// VadIterator class: uses ONNX Runtime to detect speech segments.
+class VadIterator {
+private:
+ // ONNX Runtime resources
+ Ort::Env env;
+ Ort::SessionOptions session_options;
+ std::shared_ptr session = nullptr;
+ Ort::AllocatorWithDefaultOptions allocator;
+ Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);
+
+ // ----- Context-related additions -----
+ const int context_samples = 64; // For 16kHz, 64 samples are added as context.
+ std::vector _context; // Holds the last 64 samples from the previous chunk (initialized to zero).
+
+ // Original window size (e.g., 32ms corresponds to 512 samples)
+ int window_size_samples;
+ // Effective window size = window_size_samples + context_samples
+ int effective_window_size;
+
+ // Additional declaration: samples per millisecond
+ int sr_per_ms;
+
+ // ONNX Runtime input/output buffers
+ std::vector ort_inputs;
+ std::vector input_node_names = { "input", "state", "sr" };
+ std::vector input;
+ unsigned int size_state = 2 * 1 * 128;
+ std::vector _state;
+ std::vector sr;
+ int64_t input_node_dims[2] = {};
+ const int64_t state_node_dims[3] = { 2, 1, 128 };
+ const int64_t sr_node_dims[1] = { 1 };
+ std::vector ort_outputs;
+ std::vector output_node_names = { "output", "stateN" };
+
+ // Model configuration parameters
+ int sample_rate;
+ float threshold;
+ int min_silence_samples;
+ int min_silence_samples_at_max_speech;
+ int min_speech_samples;
+ float max_speech_samples;
+ int speech_pad_samples;
+ int audio_length_samples;
+
+ // State management
+ bool triggered = false;
+ unsigned int temp_end = 0;
+ unsigned int current_sample = 0;
+ int prev_end;
+ int next_start = 0;
+ std::vector speeches;
+ timestamp_t current_speech;
+
+ // Loads the ONNX model.
+ void init_onnx_model(const std::wstring& model_path) {
+ init_engine_threads(1, 1);
+ session = std::make_shared(env, model_path.c_str(), session_options);
+ }
+
+ // Initializes threading settings.
+ void init_engine_threads(int inter_threads, int intra_threads) {
+ session_options.SetIntraOpNumThreads(intra_threads);
+ session_options.SetInterOpNumThreads(inter_threads);
+ session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+ }
+
+ // Resets internal state (_state, _context, etc.)
+ void reset_states() {
+ std::memset(_state.data(), 0, _state.size() * sizeof(float));
+ triggered = false;
+ temp_end = 0;
+ current_sample = 0;
+ prev_end = next_start = 0;
+ speeches.clear();
+ current_speech = timestamp_t();
+ std::fill(_context.begin(), _context.end(), 0.0f);
+ }
+
+ // Inference: runs inference on one chunk of input data.
+ // data_chunk is expected to have window_size_samples samples.
+ void predict(const std::vector& data_chunk) {
+ // Build new input: first context_samples from _context, followed by the current chunk (window_size_samples).
+ std::vector new_data(effective_window_size, 0.0f);
+ std::copy(_context.begin(), _context.end(), new_data.begin());
+ std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
+ input = new_data;
+
+ // Create input tensor (input_node_dims[1] is already set to effective_window_size).
+ Ort::Value input_ort = Ort::Value::CreateTensor(
+ memory_info, input.data(), input.size(), input_node_dims, 2);
+ Ort::Value state_ort = Ort::Value::CreateTensor(
+ memory_info, _state.data(), _state.size(), state_node_dims, 3);
+ Ort::Value sr_ort = Ort::Value::CreateTensor(
+ memory_info, sr.data(), sr.size(), sr_node_dims, 1);
+ ort_inputs.clear();
+ ort_inputs.emplace_back(std::move(input_ort));
+ ort_inputs.emplace_back(std::move(state_ort));
+ ort_inputs.emplace_back(std::move(sr_ort));
+
+ // Run inference.
+ ort_outputs = session->Run(
+ Ort::RunOptions{ nullptr },
+ input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
+ output_node_names.data(), output_node_names.size());
+
+ float speech_prob = ort_outputs[0].GetTensorMutableData()[0];
+ float* stateN = ort_outputs[1].GetTensorMutableData();
+ std::memcpy(_state.data(), stateN, size_state * sizeof(float));
+ current_sample += static_cast(window_size_samples); // Advance by the original window size.
+
+ // If speech is detected (probability >= threshold)
+ if (speech_prob >= threshold) {
+#ifdef __DEBUG_SPEECH_PROB___
+ float speech = current_sample - window_size_samples;
+ printf("{ start: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
+#endif
+ if (temp_end != 0) {
+ temp_end = 0;
+ if (next_start < prev_end)
+ next_start = current_sample - window_size_samples;
+ }
+ if (!triggered) {
+ triggered = true;
+ current_speech.start = current_sample - window_size_samples;
+ }
+ // Update context: copy the last context_samples from new_data.
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
+ return;
+ }
+
+ // If the speech segment becomes too long.
+ if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
+ if (prev_end > 0) {
+ current_speech.end = prev_end;
+ speeches.push_back(current_speech);
+ current_speech = timestamp_t();
+ if (next_start < prev_end)
+ triggered = false;
+ else
+ current_speech.start = next_start;
+ prev_end = 0;
+ next_start = 0;
+ temp_end = 0;
+ }
+ else {
+ current_speech.end = current_sample;
+ speeches.push_back(current_speech);
+ current_speech = timestamp_t();
+ prev_end = 0;
+ next_start = 0;
+ temp_end = 0;
+ triggered = false;
+ }
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
+ return;
+ }
+
+ if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
+ // When the speech probability temporarily drops but is still in speech, update context without changing state.
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
+ return;
+ }
+
+ if (speech_prob < (threshold - 0.15)) {
+#ifdef __DEBUG_SPEECH_PROB___
+ float speech = current_sample - window_size_samples - speech_pad_samples;
+ printf("{ end: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
+#endif
+ if (triggered) {
+ if (temp_end == 0)
+ temp_end = current_sample;
+ if (current_sample - temp_end > min_silence_samples_at_max_speech)
+ prev_end = temp_end;
+ if ((current_sample - temp_end) >= min_silence_samples) {
+ current_speech.end = temp_end;
+ if (current_speech.end - current_speech.start > min_speech_samples) {
+ speeches.push_back(current_speech);
+ current_speech = timestamp_t();
+ prev_end = 0;
+ next_start = 0;
+ temp_end = 0;
+ triggered = false;
+ }
+ }
+ }
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
+ return;
+ }
+ }
+
+public:
+ // Process the entire audio input.
+ void process(const std::vector& input_wav) {
+ reset_states();
+ audio_length_samples = static_cast(input_wav.size());
+ // Process audio in chunks of window_size_samples (e.g., 512 samples)
+ for (size_t j = 0; j < static_cast(audio_length_samples); j += static_cast(window_size_samples)) {
+ if (j + static_cast(window_size_samples) > static_cast(audio_length_samples))
+ break;
+ std::vector chunk(&input_wav[j], &input_wav[j] + window_size_samples);
+ predict(chunk);
+ }
+ if (current_speech.start >= 0) {
+ current_speech.end = audio_length_samples;
+ speeches.push_back(current_speech);
+ current_speech = timestamp_t();
+ prev_end = 0;
+ next_start = 0;
+ temp_end = 0;
+ triggered = false;
+ }
+ }
+
+ // Returns the detected speech timestamps.
+ const std::vector get_speech_timestamps() const {
+ return speeches;
+ }
+
+ // Public method to reset the internal state.
+ void reset() {
+ reset_states();
+ }
+
+public:
+ // Constructor: sets model path, sample rate, window size (ms), and other parameters.
+ // The parameters are set to match the Python version.
+ VadIterator(const std::wstring ModelPath,
+ int Sample_rate = 16000, int windows_frame_size = 32,
+ float Threshold = 0.5, int min_silence_duration_ms = 100,
+ int speech_pad_ms = 30, int min_speech_duration_ms = 250,
+ float max_speech_duration_s = std::numeric_limits::infinity())
+ : sample_rate(Sample_rate), threshold(Threshold), speech_pad_samples(speech_pad_ms), prev_end(0)
+ {
+ sr_per_ms = sample_rate / 1000; // e.g., 16000 / 1000 = 16
+ window_size_samples = windows_frame_size * sr_per_ms; // e.g., 32ms * 16 = 512 samples
+ effective_window_size = window_size_samples + context_samples; // e.g., 512 + 64 = 576 samples
+ input_node_dims[0] = 1;
+ input_node_dims[1] = effective_window_size;
+ _state.resize(size_state);
+ sr.resize(1);
+ sr[0] = sample_rate;
+ _context.assign(context_samples, 0.0f);
+ min_speech_samples = sr_per_ms * min_speech_duration_ms;
+ max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
+ min_silence_samples = sr_per_ms * min_silence_duration_ms;
+ min_silence_samples_at_max_speech = sr_per_ms * 98;
+ init_onnx_model(ModelPath);
+ }
+};
+
+int main() {
+ // Read the WAV file (expects 16000 Hz, mono, PCM).
+ wav::WavReader wav_reader("audio/recorder.wav"); // File located in the "audio" folder.
+ int numSamples = wav_reader.num_samples();
+ std::vector input_wav(static_cast(numSamples));
+ for (size_t i = 0; i < static_cast(numSamples); i++) {
+ input_wav[i] = static_cast(*(wav_reader.data() + i));
+ }
+
+ // Set the ONNX model path (file located in the "model" folder).
+ std::wstring model_path = L"model/silero_vad.onnx";
+
+ // Initialize the VadIterator.
+ VadIterator vad(model_path);
+
+ // Process the audio.
+ vad.process(input_wav);
+
+ // Retrieve the speech timestamps (in samples).
+ std::vector stamps = vad.get_speech_timestamps();
+
+ // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
+ const float sample_rate_float = 16000.0f;
+ for (size_t i = 0; i < stamps.size(); i++) {
+ float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
+ float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
+ std::cout << "Speech detected from "
+ << std::fixed << std::setprecision(1) << start_sec
+ << " s to "
+ << std::fixed << std::setprecision(1) << end_sec
+ << " s" << std::endl;
+ }
+
+ // Optionally, reset the internal state.
+ vad.reset();
+
+ return 0;
+}
diff --git a/silero-vad/examples/cpp/wav.h b/silero-vad/examples/cpp/wav.h
new file mode 100644
index 0000000..e6ef442
--- /dev/null
+++ b/silero-vad/examples/cpp/wav.h
@@ -0,0 +1,237 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+ char riff[4]; // "riff"
+ unsigned int size;
+ char wav[4]; // "WAVE"
+ char fmt[4]; // "fmt "
+ unsigned int fmt_size;
+ uint16_t format;
+ uint16_t channels;
+ unsigned int sample_rate;
+ unsigned int bytes_per_second;
+ uint16_t block_size;
+ uint16_t bit;
+ char data[4]; // "data"
+ unsigned int data_size;
+};
+
+class WavReader {
+ public:
+ WavReader() : data_(nullptr) {}
+ explicit WavReader(const std::string& filename) { Open(filename); }
+
+ bool Open(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "rb"); //open file for reading
+ if (NULL == fp) {
+ std::cout << "Error in read " << filename;
+ return false;
+ }
+
+ WavHeader header;
+ fread(&header, 1, sizeof(header), fp);
+ if (header.fmt_size < 16) {
+ printf("WaveData: expect PCM format data "
+ "to have fmt chunk of at least size 16.\n");
+ return false;
+ } else if (header.fmt_size > 16) {
+ int offset = 44 - 8 + header.fmt_size - 16;
+ fseek(fp, offset, SEEK_SET);
+ fread(header.data, 8, sizeof(char), fp);
+ }
+ // check "riff" "WAVE" "fmt " "data"
+
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
+ // be a single "fact" sub chunk, but on Windows there can also be a
+ // "list" sub chunk.
+ while (0 != strncmp(header.data, "data", 4)) {
+ // We will just ignore the data in these chunks.
+ fseek(fp, header.data_size, SEEK_CUR);
+ // read next sub chunk
+ fread(header.data, 8, sizeof(char), fp);
+ }
+
+ if (header.data_size == 0) {
+ int offset = ftell(fp);
+ fseek(fp, 0, SEEK_END);
+ header.data_size = ftell(fp) - offset;
+ fseek(fp, offset, SEEK_SET);
+ }
+
+ num_channel_ = header.channels;
+ sample_rate_ = header.sample_rate;
+ bits_per_sample_ = header.bit;
+ int num_data = header.data_size / (bits_per_sample_ / 8);
+ data_ = new float[num_data]; // Create 1-dim array
+ num_samples_ = num_data / num_channel_;
+
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+ std::cout << "num_samples :" << num_data << std::endl;
+ std::cout << "num_data_size :" << header.data_size << std::endl;
+
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(char), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 16: {
+ int16_t sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int16_t), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 32:
+ {
+ if (header.format == 1) //S32
+ {
+ int sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ }
+ else if (header.format == 3) // IEEE-float
+ {
+ float sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(float), fp);
+ data_[i] = static_cast(sample);
+ }
+ }
+ else {
+ printf("unsupported quantization bits\n");
+ }
+ break;
+ }
+ default:
+ printf("unsupported quantization bits\n");
+ break;
+ }
+
+ fclose(fp);
+ return true;
+ }
+
+ int num_channel() const { return num_channel_; }
+ int sample_rate() const { return sample_rate_; }
+ int bits_per_sample() const { return bits_per_sample_; }
+ int num_samples() const { return num_samples_; }
+
+ ~WavReader() {
+ delete[] data_;
+ }
+
+ const float* data() const { return data_; }
+
+ private:
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+ int num_samples_; // sample points per channel
+ float* data_;
+};
+
+class WavWriter {
+ public:
+ WavWriter(const float* data, int num_samples, int num_channel,
+ int sample_rate, int bits_per_sample)
+ : data_(data),
+ num_samples_(num_samples),
+ num_channel_(num_channel),
+ sample_rate_(sample_rate),
+ bits_per_sample_(bits_per_sample) {}
+
+ void Write(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "w");
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
+ WavHeader header;
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+ memcpy(&header, wav_header, sizeof(header));
+ header.channels = num_channel_;
+ header.bit = bits_per_sample_;
+ header.sample_rate = sample_rate_;
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+ header.size = sizeof(header) - 8 + header.data_size;
+ header.bytes_per_second =
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+ fwrite(&header, 1, sizeof(header), fp);
+
+ for (int i = 0; i < num_samples_; ++i) {
+ for (int j = 0; j < num_channel_; ++j) {
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 16: {
+ int16_t sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 32: {
+ int sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ }
+ }
+ }
+ fclose(fp);
+ }
+
+ private:
+ const float* data_;
+ int num_samples_; // total float points in data_
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+};
+
+} // namespace wav
+
+#endif // FRONTEND_WAV_H_
diff --git a/silero-vad/examples/cpp_libtorch/README.md b/silero-vad/examples/cpp_libtorch/README.md
new file mode 100644
index 0000000..be18cee
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch/README.md
@@ -0,0 +1,45 @@
+# Silero-VAD V5 in C++ (based on LibTorch)
+
+This is the source code for Silero-VAD V5 in C++, utilizing LibTorch. The primary implementation is CPU-based, and you should compare its results with the Python version. Only results at 16kHz have been tested.
+
+Additionally, batch and CUDA inference options are available if you want to explore further. Note that when using batch inference, the speech probabilities may slightly differ from the standard version, likely due to differences in caching. Unlike individual input processing, batch inference may not use the cache from previous chunks. Despite this, batch inference offers significantly faster processing. For optimal performance, consider adjusting the threshold when using batch inference.
+
+## Requirements
+
+- GCC 11.4.0 (GCC >= 5.1)
+- LibTorch 1.13.0 (other versions are also acceptable)
+
+## Download LibTorch
+
+```bash
+-CPU Version
+wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
+unzip libtorch-shared-with-deps-1.13.0+cpu.zip
+
+-CUDA Version
+wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
+unzip libtorch-shared-with-deps-1.13.0+cu116.zip
+```
+
+## Compilation
+
+```bash
+-CPU Version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
+
+-CUDA Version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
+```
+
+
+## Optional Compilation Flags
+-DUSE_BATCH: Enable batch inference
+-DUSE_GPU: Use GPU for inference
+
+## Run the Program
+To run the program, use the following command:
+
+`./silero aepyx.wav 16000 0.5`
+
+The sample file aepyx.wav is part of the Voxconverse dataset.
+File details: aepyx.wav is a 16kHz, 16-bit audio file.
diff --git a/silero-vad/examples/cpp_libtorch/aepyx.wav b/silero-vad/examples/cpp_libtorch/aepyx.wav
new file mode 100644
index 0000000..b8b46de
Binary files /dev/null and b/silero-vad/examples/cpp_libtorch/aepyx.wav differ
diff --git a/silero-vad/examples/cpp_libtorch/main.cc b/silero-vad/examples/cpp_libtorch/main.cc
new file mode 100644
index 0000000..3f774e7
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch/main.cc
@@ -0,0 +1,54 @@
+#include
+#include "silero_torch.h"
+#include "wav.h"
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 4){
+ std::cerr<<"Usage : "<"< input_wav(wav_reader.num_samples());
+
+ for (int i = 0; i < wav_reader.num_samples(); i++)
+ {
+ input_wav[i] = static_cast(*(wav_reader.data() + i));
+ }
+
+ vad.SpeechProbs(input_wav);
+
+ std::vector speeches = vad.GetSpeechTimestamps();
+ for(const auto& speech : speeches){
+ if(vad.print_as_samples){
+ std::cout<<"{'start': "<(speech.start)<<", 'end': "<(speech.end)<<"}"<& input_wav){
+ // Set the sample rate (must match the model's expected sample rate)
+ // Process the waveform in chunks of 512 samples
+ int num_samples = input_wav.size();
+ int num_chunks = num_samples / window_size_samples;
+ int remainder_samples = num_samples % window_size_samples;
+
+ total_sample_size += num_samples;
+
+ torch::Tensor output;
+ std::vector chunks;
+
+ for (int i = 0; i < num_chunks; i++) {
+
+ float* chunk_start = input_wav.data() + i *window_size_samples;
+ torch::Tensor chunk = torch::from_blob(chunk_start, {1,window_size_samples}, torch::kFloat32);
+ //std::cout<<"chunk size : "<0){//마지막 chunk && 나머지가 존재
+ int remaining_samples = num_samples - num_chunks * window_size_samples;
+ //std::cout<<"Remainder size : "< inputs;
+ inputs.push_back(batched_chunks); // Batch of chunks
+ inputs.push_back(sample_rate); // Assuming sample_rate is a valid input for the model
+
+ // Run inference on the batch
+ torch::NoGradGuard no_grad;
+ torch::Tensor output = model.forward(inputs).toTensor();
+#ifdef USE_GPU
+ output = output.to(at::kCPU); // Move the output back to CPU once
+#endif
+ // Collect output probabilities
+ for (int i = 0; i < chunks.size(); i++) {
+ float output_f = output[i].item();
+ outputs_prob.push_back(output_f);
+ //std::cout << "Chunk " << i << " prob: " << output_f<< "\n";
+ }
+#else
+
+ std::vector outputs;
+ torch::Tensor batched_chunks = torch::stack(chunks);
+#ifdef USE_GPU
+ batched_chunks = batched_chunks.to(at::kCUDA);
+#endif
+ for (int i = 0; i < chunks.size(); i++) {
+ torch::NoGradGuard no_grad;
+ std::vector inputs;
+ inputs.push_back(batched_chunks[i]);
+ inputs.push_back(sample_rate);
+
+ torch::Tensor output = model.forward(inputs).toTensor();
+ outputs.push_back(output);
+ }
+ torch::Tensor all_outputs = torch::stack(outputs);
+#ifdef USE_GPU
+ all_outputs = all_outputs.to(at::kCPU);
+#endif
+ for (int i = 0; i < chunks.size(); i++) {
+ float output_f = all_outputs[i].item();
+ outputs_prob.push_back(output_f);
+ }
+
+
+
+#endif
+
+ }
+
+
+ }
+
+
+ std::vector VadIterator::GetSpeechTimestamps() {
+ std::vector speeches = DoVad();
+
+#ifdef USE_BATCH
+ //When you use BATCH inference, you had better use the 'mergeSpeeches' function to arrange the timestamps.
+ //It can produce more reasonable output, because batch-mode probabilities are distorted.
+ duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
+ std::vector speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
+ if(!print_as_samples){
+ for (auto& speech : speeches_merge) { //samples to second
+ speech.start /= sample_rate;
+ speech.end /= sample_rate;
+ }
+ }
+
+ return speeches_merge;
+#else
+
+ if(!print_as_samples){
+ for (auto& speech : speeches) { //samples to second
+ speech.start /= sample_rate;
+ speech.end /= sample_rate;
+ }
+ }
+
+ return speeches;
+
+#endif
+
+ }
+ void VadIterator::SetVariables(){
+ init_engine(window_size_ms);
+ }
+
+ void VadIterator::init_engine(int window_size_ms) {
+ min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
+ speech_pad_samples = sample_rate * speech_pad_ms / 1000;
+ window_size_samples = sample_rate / 1000 * window_size_ms;
+ min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
+ }
+
+ void VadIterator::init_torch_model(const std::string& model_path) {
+ at::set_num_threads(1);
+ model = torch::jit::load(model_path);
+
+#ifdef USE_GPU
+ if (!torch::cuda::is_available()) {
+ std::cout<<"CUDA is not available! Please check your GPU settings"< VadIterator::DoVad() {
+ std::vector speeches;
+
+ for (size_t i = 0; i < outputs_prob.size(); ++i) {
+ float speech_prob = outputs_prob[i];
+ //std::cout << speech_prob << std::endl;
+ //std::cout << "Chunk " << i << " Prob: " << speech_prob << "\n";
+ //std::cout << speech_prob << " ";
+ current_sample += window_size_samples;
+
+ if (speech_prob >= threshold && temp_end != 0) {
+ temp_end = 0;
+ }
+
+ if (speech_prob >= threshold && !triggered) {
+ triggered = true;
+ SpeechSegment segment;
+ segment.start = std::max(static_cast(0), current_sample - speech_pad_samples - window_size_samples);
+ speeches.push_back(segment);
+ continue;
+ }
+
+ if (speech_prob < threshold - 0.15f && triggered) {
+ if (temp_end == 0) {
+ temp_end = current_sample;
+ }
+
+ if (current_sample - temp_end < min_silence_samples) {
+ continue;
+ } else {
+ SpeechSegment& segment = speeches.back();
+ segment.end = temp_end + speech_pad_samples - window_size_samples;
+ temp_end = 0;
+ triggered = false;
+ }
+ }
+ }
+
+ if (triggered) { //만약 낮은 확률을 보이다가 마지막프레임 prbos만 딱 확률이 높게 나오면 위에서 triggerd = true 메핑과 동시에 segment start가 돼서 문제가 될것 같은데? start = end 같은값? 후처리가 있으니 문제가 없으려나?
+ std::cout<<"when last triggered is keep working until last Probs"<speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
+ //min_speech_samples is 4000samples(0.25sec)
+ //여기서 포인트!! 계산 할때는 start,end sample에'speech_pad_samples' 사이즈를 추가한후 길이를 측정함.
+ }
+ ),
+ speeches.end()
+ );
+
+
+ //std::cout< VadIterator::mergeSpeeches(const std::vector& speeches, int duration_merge_samples) {
+ std::vector mergedSpeeches;
+
+ if (speeches.empty()) {
+ return mergedSpeeches; // 빈 벡터 반환
+ }
+
+ // 첫 번째 구간으로 초기화
+ SpeechSegment currentSegment = speeches[0];
+
+ for (size_t i = 1; i < speeches.size(); ++i) { //첫번째 start,end 정보 건너뛰기. 그래서 i=1부터
+ // 두 구간의 차이가 threshold(duration_merge_samples)보다 작은 경우, 합침
+ if (speeches[i].start - currentSegment.end < duration_merge_samples) {
+ // 현재 구간의 끝점을 업데이트
+ currentSegment.end = speeches[i].end;
+ } else {
+ // 차이가 threshold(duration_merge_samples) 이상이면 현재 구간을 저장하고 새로운 구간 시작
+ mergedSpeeches.push_back(currentSegment);
+ currentSegment = speeches[i];
+ }
+ }
+
+ // 마지막 구간 추가
+ mergedSpeeches.push_back(currentSegment);
+
+ return mergedSpeeches;
+ }
+
+ }
diff --git a/silero-vad/examples/cpp_libtorch/silero_torch.h b/silero-vad/examples/cpp_libtorch/silero_torch.h
new file mode 100644
index 0000000..d8d3bc7
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch/silero_torch.h
@@ -0,0 +1,75 @@
+//Author : Nathan Lee
+//Created On : 2024-11-18
+//Description : silero 5.1 system for torch-script(c++).
+//Version : 1.0
+
+#ifndef SILERO_TORCH_H
+#define SILERO_TORCH_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+namespace silero{
+
+ struct SpeechSegment{
+ int start;
+ int end;
+ };
+
+ class VadIterator{
+ public:
+
+ VadIterator(const std::string &model_path, float threshold = 0.5, int sample_rate = 16000,
+ int window_size_ms = 32, int speech_pad_ms = 30, int min_silence_duration_ms = 100,
+ int min_speech_duration_ms = 250, int max_duration_merge_ms = 300, bool print_as_samples = false);
+ ~VadIterator();
+
+
+ void SpeechProbs(std::vector& input_wav);
+ std::vector GetSpeechTimestamps();
+ void SetVariables();
+
+ float threshold;
+ int sample_rate;
+ int window_size_ms;
+ int min_speech_duration_ms;
+ int max_duration_merge_ms;
+ bool print_as_samples;
+
+ private:
+ torch::jit::script::Module model;
+ std::vector outputs_prob;
+ int min_silence_samples;
+ int min_speech_samples;
+ int speech_pad_samples;
+ int window_size_samples;
+ int duration_merge_samples;
+ int current_sample = 0;
+
+ int total_sample_size=0;
+
+ int min_silence_duration_ms;
+ int speech_pad_ms;
+ bool triggered = false;
+ int temp_end = 0;
+
+ void init_engine(int window_size_ms);
+ void init_torch_model(const std::string& model_path);
+ void reset_states();
+ std::vector DoVad();
+ std::vector mergeSpeeches(const std::vector& speeches, int duration_merge_samples);
+
+ };
+
+}
+#endif // SILERO_TORCH_H
diff --git a/silero-vad/examples/cpp_libtorch/wav.h b/silero-vad/examples/cpp_libtorch/wav.h
new file mode 100644
index 0000000..249d7e3
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch/wav.h
@@ -0,0 +1,235 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+ char riff[4]; // "riff"
+ unsigned int size;
+ char wav[4]; // "WAVE"
+ char fmt[4]; // "fmt "
+ unsigned int fmt_size;
+ uint16_t format;
+ uint16_t channels;
+ unsigned int sample_rate;
+ unsigned int bytes_per_second;
+ uint16_t block_size;
+ uint16_t bit;
+ char data[4]; // "data"
+ unsigned int data_size;
+};
+
+class WavReader {
+ public:
+ WavReader() : data_(nullptr) {}
+ explicit WavReader(const std::string& filename) { Open(filename); }
+
+ bool Open(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
+ if (NULL == fp) {
+ std::cout << "Error in read " << filename;
+ return false;
+ }
+
+ WavHeader header;
+ fread(&header, 1, sizeof(header), fp);
+ if (header.fmt_size < 16) {
+ printf("WaveData: expect PCM format data "
+ "to have fmt chunk of at least size 16.\n");
+ return false;
+ } else if (header.fmt_size > 16) {
+ int offset = 44 - 8 + header.fmt_size - 16;
+ fseek(fp, offset, SEEK_SET);
+ fread(header.data, 8, sizeof(char), fp);
+ }
+ // check "riff" "WAVE" "fmt " "data"
+
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
+ // be a single "fact" sub chunk, but on Windows there can also be a
+ // "list" sub chunk.
+ while (0 != strncmp(header.data, "data", 4)) {
+ // We will just ignore the data in these chunks.
+ fseek(fp, header.data_size, SEEK_CUR);
+ // read next sub chunk
+ fread(header.data, 8, sizeof(char), fp);
+ }
+
+ if (header.data_size == 0) {
+ int offset = ftell(fp);
+ fseek(fp, 0, SEEK_END);
+ header.data_size = ftell(fp) - offset;
+ fseek(fp, offset, SEEK_SET);
+ }
+
+ num_channel_ = header.channels;
+ sample_rate_ = header.sample_rate;
+ bits_per_sample_ = header.bit;
+ int num_data = header.data_size / (bits_per_sample_ / 8);
+ data_ = new float[num_data]; // Create 1-dim array
+ num_samples_ = num_data / num_channel_;
+
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+ std::cout << "num_samples :" << num_data << std::endl;
+ std::cout << "num_data_size :" << header.data_size << std::endl;
+
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(char), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 16: {
+ int16_t sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int16_t), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 32:
+ {
+ if (header.format == 1) //S32
+ {
+ int sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ }
+ else if (header.format == 3) // IEEE-float
+ {
+ float sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(float), fp);
+ data_[i] = static_cast(sample);
+ }
+ }
+ else {
+ printf("unsupported quantization bits\n");
+ }
+ break;
+ }
+ default:
+ printf("unsupported quantization bits\n");
+ break;
+ }
+
+ fclose(fp);
+ return true;
+ }
+
+ int num_channel() const { return num_channel_; }
+ int sample_rate() const { return sample_rate_; }
+ int bits_per_sample() const { return bits_per_sample_; }
+ int num_samples() const { return num_samples_; }
+
+ ~WavReader() {
+ delete[] data_;
+ }
+
+ const float* data() const { return data_; }
+
+ private:
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+ int num_samples_; // sample points per channel
+ float* data_;
+};
+
+class WavWriter {
+ public:
+ WavWriter(const float* data, int num_samples, int num_channel,
+ int sample_rate, int bits_per_sample)
+ : data_(data),
+ num_samples_(num_samples),
+ num_channel_(num_channel),
+ sample_rate_(sample_rate),
+ bits_per_sample_(bits_per_sample) {}
+
+ void Write(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "w");
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
+ WavHeader header;
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+ memcpy(&header, wav_header, sizeof(header));
+ header.channels = num_channel_;
+ header.bit = bits_per_sample_;
+ header.sample_rate = sample_rate_;
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+ header.size = sizeof(header) - 8 + header.data_size;
+ header.bytes_per_second =
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+ fwrite(&header, 1, sizeof(header), fp);
+
+ for (int i = 0; i < num_samples_; ++i) {
+ for (int j = 0; j < num_channel_; ++j) {
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 16: {
+ int16_t sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 32: {
+ int sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ }
+ }
+ }
+ fclose(fp);
+ }
+
+ private:
+ const float* data_;
+ int num_samples_; // total float points in data_
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+};
+
+} // namespace wav
+
+#endif // FRONTEND_WAV_H_
diff --git a/silero-vad/examples/cpp_libtorch_deprecated/README.md b/silero-vad/examples/cpp_libtorch_deprecated/README.md
new file mode 100644
index 0000000..be18cee
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch_deprecated/README.md
@@ -0,0 +1,45 @@
+# Silero-VAD V5 in C++ (based on LibTorch)
+
+This is the source code for Silero-VAD V5 in C++, utilizing LibTorch. The primary implementation is CPU-based, and you should compare its results with the Python version. Only results at 16kHz have been tested.
+
+Additionally, batch and CUDA inference options are available if you want to explore further. Note that when using batch inference, the speech probabilities may slightly differ from the standard version, likely due to differences in caching. Unlike individual input processing, batch inference may not use the cache from previous chunks. Despite this, batch inference offers significantly faster processing. For optimal performance, consider adjusting the threshold when using batch inference.
+
+## Requirements
+
+- GCC 11.4.0 (GCC >= 5.1)
+- LibTorch 1.13.0 (other versions are also acceptable)
+
+## Download LibTorch
+
+```bash
+-CPU Version
+wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
+unzip libtorch-shared-with-deps-1.13.0+cpu.zip
+
+-CUDA Version
+wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
+unzip libtorch-shared-with-deps-1.13.0+cu116.zip
+```
+
+## Compilation
+
+```bash
+-CPU Version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
+
+-CUDA Version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
+```
+
+
+## Optional Compilation Flags
+-DUSE_BATCH: Enable batch inference
+-DUSE_GPU: Use GPU for inference
+
+## Run the Program
+To run the program, use the following command:
+
+`./silero aepyx.wav 16000 0.5`
+
+The sample file aepyx.wav is part of the Voxconverse dataset.
+File details: aepyx.wav is a 16kHz, 16-bit audio file.
diff --git a/silero-vad/examples/cpp_libtorch_deprecated/aepyx.wav b/silero-vad/examples/cpp_libtorch_deprecated/aepyx.wav
new file mode 100644
index 0000000..b8b46de
Binary files /dev/null and b/silero-vad/examples/cpp_libtorch_deprecated/aepyx.wav differ
diff --git a/silero-vad/examples/cpp_libtorch_deprecated/main.cc b/silero-vad/examples/cpp_libtorch_deprecated/main.cc
new file mode 100644
index 0000000..3f774e7
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch_deprecated/main.cc
@@ -0,0 +1,54 @@
+#include
+#include "silero_torch.h"
+#include "wav.h"
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 4){
+ std::cerr<<"Usage : "<"< input_wav(wav_reader.num_samples());
+
+ for (int i = 0; i < wav_reader.num_samples(); i++)
+ {
+ input_wav[i] = static_cast(*(wav_reader.data() + i));
+ }
+
+ vad.SpeechProbs(input_wav);
+
+ std::vector speeches = vad.GetSpeechTimestamps();
+ for(const auto& speech : speeches){
+ if(vad.print_as_samples){
+ std::cout<<"{'start': "<(speech.start)<<", 'end': "<(speech.end)<<"}"<& input_wav){
+ // Set the sample rate (must match the model's expected sample rate)
+ // Process the waveform in chunks of 512 samples
+ int num_samples = input_wav.size();
+ int num_chunks = num_samples / window_size_samples;
+ int remainder_samples = num_samples % window_size_samples;
+
+ total_sample_size += num_samples;
+
+ torch::Tensor output;
+ std::vector chunks;
+
+ for (int i = 0; i < num_chunks; i++) {
+
+ float* chunk_start = input_wav.data() + i *window_size_samples;
+ torch::Tensor chunk = torch::from_blob(chunk_start, {1,window_size_samples}, torch::kFloat32);
+ //std::cout<<"chunk size : "<0){//마지막 chunk && 나머지가 존재
+ int remaining_samples = num_samples - num_chunks * window_size_samples;
+ //std::cout<<"Remainder size : "< inputs;
+ inputs.push_back(batched_chunks); // Batch of chunks
+ inputs.push_back(sample_rate); // Assuming sample_rate is a valid input for the model
+
+ // Run inference on the batch
+ torch::NoGradGuard no_grad;
+ torch::Tensor output = model.forward(inputs).toTensor();
+#ifdef USE_GPU
+ output = output.to(at::kCPU); // Move the output back to CPU once
+#endif
+ // Collect output probabilities
+ for (int i = 0; i < chunks.size(); i++) {
+ float output_f = output[i].item();
+ outputs_prob.push_back(output_f);
+ //std::cout << "Chunk " << i << " prob: " << output_f<< "\n";
+ }
+#else
+
+ std::vector outputs;
+ torch::Tensor batched_chunks = torch::stack(chunks);
+#ifdef USE_GPU
+ batched_chunks = batched_chunks.to(at::kCUDA);
+#endif
+ for (int i = 0; i < chunks.size(); i++) {
+ torch::NoGradGuard no_grad;
+ std::vector inputs;
+ inputs.push_back(batched_chunks[i]);
+ inputs.push_back(sample_rate);
+
+ torch::Tensor output = model.forward(inputs).toTensor();
+ outputs.push_back(output);
+ }
+ torch::Tensor all_outputs = torch::stack(outputs);
+#ifdef USE_GPU
+ all_outputs = all_outputs.to(at::kCPU);
+#endif
+ for (int i = 0; i < chunks.size(); i++) {
+ float output_f = all_outputs[i].item();
+ outputs_prob.push_back(output_f);
+ }
+
+
+
+#endif
+
+ }
+
+
+ }
+
+
+ std::vector VadIterator::GetSpeechTimestamps() {
+ std::vector speeches = DoVad();
+
+#ifdef USE_BATCH
+ //When you use BATCH inference, you had better use the 'mergeSpeeches' function to arrange the timestamps.
+ //It can produce more reasonable output, because batch-mode probabilities are distorted.
+ duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
+ std::vector speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
+ if(!print_as_samples){
+ for (auto& speech : speeches_merge) { //samples to second
+ speech.start /= sample_rate;
+ speech.end /= sample_rate;
+ }
+ }
+
+ return speeches_merge;
+#else
+
+ if(!print_as_samples){
+ for (auto& speech : speeches) { //samples to second
+ speech.start /= sample_rate;
+ speech.end /= sample_rate;
+ }
+ }
+
+ return speeches;
+
+#endif
+
+ }
+ void VadIterator::SetVariables(){
+ init_engine(window_size_ms);
+ }
+
+ void VadIterator::init_engine(int window_size_ms) {
+ min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
+ speech_pad_samples = sample_rate * speech_pad_ms / 1000;
+ window_size_samples = sample_rate / 1000 * window_size_ms;
+ min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
+ }
+
+ void VadIterator::init_torch_model(const std::string& model_path) {
+ at::set_num_threads(1);
+ model = torch::jit::load(model_path);
+
+#ifdef USE_GPU
+ if (!torch::cuda::is_available()) {
+ std::cout<<"CUDA is not available! Please check your GPU settings"< VadIterator::DoVad() {
+ std::vector speeches;
+
+ for (size_t i = 0; i < outputs_prob.size(); ++i) {
+ float speech_prob = outputs_prob[i];
+ //std::cout << speech_prob << std::endl;
+ //std::cout << "Chunk " << i << " Prob: " << speech_prob << "\n";
+ //std::cout << speech_prob << " ";
+ current_sample += window_size_samples;
+
+ if (speech_prob >= threshold && temp_end != 0) {
+ temp_end = 0;
+ }
+
+ if (speech_prob >= threshold && !triggered) {
+ triggered = true;
+ SpeechSegment segment;
+ segment.start = std::max(static_cast(0), current_sample - speech_pad_samples - window_size_samples);
+ speeches.push_back(segment);
+ continue;
+ }
+
+ if (speech_prob < threshold - 0.15f && triggered) {
+ if (temp_end == 0) {
+ temp_end = current_sample;
+ }
+
+ if (current_sample - temp_end < min_silence_samples) {
+ continue;
+ } else {
+ SpeechSegment& segment = speeches.back();
+ segment.end = temp_end + speech_pad_samples - window_size_samples;
+ temp_end = 0;
+ triggered = false;
+ }
+ }
+ }
+
+ if (triggered) { //만약 낮은 확률을 보이다가 마지막프레임 prbos만 딱 확률이 높게 나오면 위에서 triggerd = true 메핑과 동시에 segment start가 돼서 문제가 될것 같은데? start = end 같은값? 후처리가 있으니 문제가 없으려나?
+ std::cout<<"when last triggered is keep working until last Probs"<speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
+ //min_speech_samples is 4000samples(0.25sec)
+ //여기서 포인트!! 계산 할때는 start,end sample에'speech_pad_samples' 사이즈를 추가한후 길이를 측정함.
+ }
+ ),
+ speeches.end()
+ );
+
+
+ //std::cout< VadIterator::mergeSpeeches(const std::vector& speeches, int duration_merge_samples) {
+ std::vector mergedSpeeches;
+
+ if (speeches.empty()) {
+ return mergedSpeeches; // 빈 벡터 반환
+ }
+
+ // 첫 번째 구간으로 초기화
+ SpeechSegment currentSegment = speeches[0];
+
+ for (size_t i = 1; i < speeches.size(); ++i) { //첫번째 start,end 정보 건너뛰기. 그래서 i=1부터
+ // 두 구간의 차이가 threshold(duration_merge_samples)보다 작은 경우, 합침
+ if (speeches[i].start - currentSegment.end < duration_merge_samples) {
+ // 현재 구간의 끝점을 업데이트
+ currentSegment.end = speeches[i].end;
+ } else {
+ // 차이가 threshold(duration_merge_samples) 이상이면 현재 구간을 저장하고 새로운 구간 시작
+ mergedSpeeches.push_back(currentSegment);
+ currentSegment = speeches[i];
+ }
+ }
+
+ // 마지막 구간 추가
+ mergedSpeeches.push_back(currentSegment);
+
+ return mergedSpeeches;
+ }
+
+ }
diff --git a/silero-vad/examples/cpp_libtorch_deprecated/silero_torch.h b/silero-vad/examples/cpp_libtorch_deprecated/silero_torch.h
new file mode 100644
index 0000000..d8d3bc7
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch_deprecated/silero_torch.h
@@ -0,0 +1,75 @@
+//Author : Nathan Lee
+//Created On : 2024-11-18
+//Description : silero 5.1 system for torch-script(c++).
+//Version : 1.0
+
+#ifndef SILERO_TORCH_H
+#define SILERO_TORCH_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+namespace silero{
+
+ struct SpeechSegment{
+ int start;
+ int end;
+ };
+
+ class VadIterator{
+ public:
+
+ VadIterator(const std::string &model_path, float threshold = 0.5, int sample_rate = 16000,
+ int window_size_ms = 32, int speech_pad_ms = 30, int min_silence_duration_ms = 100,
+ int min_speech_duration_ms = 250, int max_duration_merge_ms = 300, bool print_as_samples = false);
+ ~VadIterator();
+
+
+ void SpeechProbs(std::vector& input_wav);
+ std::vector GetSpeechTimestamps();
+ void SetVariables();
+
+ float threshold;
+ int sample_rate;
+ int window_size_ms;
+ int min_speech_duration_ms;
+ int max_duration_merge_ms;
+ bool print_as_samples;
+
+ private:
+ torch::jit::script::Module model;
+ std::vector outputs_prob;
+ int min_silence_samples;
+ int min_speech_samples;
+ int speech_pad_samples;
+ int window_size_samples;
+ int duration_merge_samples;
+ int current_sample = 0;
+
+ int total_sample_size=0;
+
+ int min_silence_duration_ms;
+ int speech_pad_ms;
+ bool triggered = false;
+ int temp_end = 0;
+
+ void init_engine(int window_size_ms);
+ void init_torch_model(const std::string& model_path);
+ void reset_states();
+ std::vector DoVad();
+ std::vector mergeSpeeches(const std::vector& speeches, int duration_merge_samples);
+
+ };
+
+}
+#endif // SILERO_TORCH_H
diff --git a/silero-vad/examples/cpp_libtorch_deprecated/wav.h b/silero-vad/examples/cpp_libtorch_deprecated/wav.h
new file mode 100644
index 0000000..249d7e3
--- /dev/null
+++ b/silero-vad/examples/cpp_libtorch_deprecated/wav.h
@@ -0,0 +1,235 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+ char riff[4]; // "riff"
+ unsigned int size;
+ char wav[4]; // "WAVE"
+ char fmt[4]; // "fmt "
+ unsigned int fmt_size;
+ uint16_t format;
+ uint16_t channels;
+ unsigned int sample_rate;
+ unsigned int bytes_per_second;
+ uint16_t block_size;
+ uint16_t bit;
+ char data[4]; // "data"
+ unsigned int data_size;
+};
+
+class WavReader {
+ public:
+ WavReader() : data_(nullptr) {}
+ explicit WavReader(const std::string& filename) { Open(filename); }
+
+ bool Open(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
+ if (NULL == fp) {
+ std::cout << "Error in read " << filename;
+ return false;
+ }
+
+ WavHeader header;
+ fread(&header, 1, sizeof(header), fp);
+ if (header.fmt_size < 16) {
+ printf("WaveData: expect PCM format data "
+ "to have fmt chunk of at least size 16.\n");
+ return false;
+ } else if (header.fmt_size > 16) {
+ int offset = 44 - 8 + header.fmt_size - 16;
+ fseek(fp, offset, SEEK_SET);
+ fread(header.data, 8, sizeof(char), fp);
+ }
+ // check "riff" "WAVE" "fmt " "data"
+
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
+ // be a single "fact" sub chunk, but on Windows there can also be a
+ // "list" sub chunk.
+ while (0 != strncmp(header.data, "data", 4)) {
+ // We will just ignore the data in these chunks.
+ fseek(fp, header.data_size, SEEK_CUR);
+ // read next sub chunk
+ fread(header.data, 8, sizeof(char), fp);
+ }
+
+ if (header.data_size == 0) {
+ int offset = ftell(fp);
+ fseek(fp, 0, SEEK_END);
+ header.data_size = ftell(fp) - offset;
+ fseek(fp, offset, SEEK_SET);
+ }
+
+ num_channel_ = header.channels;
+ sample_rate_ = header.sample_rate;
+ bits_per_sample_ = header.bit;
+ int num_data = header.data_size / (bits_per_sample_ / 8);
+ data_ = new float[num_data]; // Create 1-dim array
+ num_samples_ = num_data / num_channel_;
+
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+ std::cout << "num_samples :" << num_data << std::endl;
+ std::cout << "num_data_size :" << header.data_size << std::endl;
+
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(char), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 16: {
+ int16_t sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int16_t), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ break;
+ }
+ case 32:
+ {
+ if (header.format == 1) //S32
+ {
+ int sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(int), fp);
+ data_[i] = static_cast(sample) / 32768;
+ }
+ }
+ else if (header.format == 3) // IEEE-float
+ {
+ float sample;
+ for (int i = 0; i < num_data; ++i) {
+ fread(&sample, 1, sizeof(float), fp);
+ data_[i] = static_cast(sample);
+ }
+ }
+ else {
+ printf("unsupported quantization bits\n");
+ }
+ break;
+ }
+ default:
+ printf("unsupported quantization bits\n");
+ break;
+ }
+
+ fclose(fp);
+ return true;
+ }
+
+ int num_channel() const { return num_channel_; }
+ int sample_rate() const { return sample_rate_; }
+ int bits_per_sample() const { return bits_per_sample_; }
+ int num_samples() const { return num_samples_; }
+
+ ~WavReader() {
+ delete[] data_;
+ }
+
+ const float* data() const { return data_; }
+
+ private:
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+ int num_samples_; // sample points per channel
+ float* data_;
+};
+
+class WavWriter {
+ public:
+ WavWriter(const float* data, int num_samples, int num_channel,
+ int sample_rate, int bits_per_sample)
+ : data_(data),
+ num_samples_(num_samples),
+ num_channel_(num_channel),
+ sample_rate_(sample_rate),
+ bits_per_sample_(bits_per_sample) {}
+
+ void Write(const std::string& filename) {
+ FILE* fp = fopen(filename.c_str(), "w");
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
+ WavHeader header;
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+ memcpy(&header, wav_header, sizeof(header));
+ header.channels = num_channel_;
+ header.bit = bits_per_sample_;
+ header.sample_rate = sample_rate_;
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+ header.size = sizeof(header) - 8 + header.data_size;
+ header.bytes_per_second =
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+ fwrite(&header, 1, sizeof(header), fp);
+
+ for (int i = 0; i < num_samples_; ++i) {
+ for (int j = 0; j < num_channel_; ++j) {
+ switch (bits_per_sample_) {
+ case 8: {
+ char sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 16: {
+ int16_t sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ case 32: {
+ int sample = static_cast(data_[i * num_channel_ + j]);
+ fwrite(&sample, 1, sizeof(sample), fp);
+ break;
+ }
+ }
+ }
+ }
+ fclose(fp);
+ }
+
+ private:
+ const float* data_;
+ int num_samples_; // total float points in data_
+ int num_channel_;
+ int sample_rate_;
+ int bits_per_sample_;
+};
+
+} // namespace wav
+
+#endif // FRONTEND_WAV_H_
diff --git a/silero-vad/examples/csharp/Program.cs b/silero-vad/examples/csharp/Program.cs
new file mode 100644
index 0000000..13a27ca
--- /dev/null
+++ b/silero-vad/examples/csharp/Program.cs
@@ -0,0 +1,35 @@
+using System.Text;
+
+namespace VadDotNet;
+
+
/// <summary>
/// Command-line example: runs the Silero VAD over a bundled WAV file and
/// prints the detected speech segments as start/end seconds.
/// </summary>
class Program
{
    // Paths are relative to the working directory the example is launched from.
    private const string MODEL_PATH = "./resources/silero_vad.onnx";
    private const string EXAMPLE_WAV_FILE = "./resources/example.wav";

    // Detection parameters mirroring the Silero VAD Python defaults.
    private const int SAMPLE_RATE = 16000;
    private const float THRESHOLD = 0.5f;
    private const int MIN_SPEECH_DURATION_MS = 250;
    private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
    private const int MIN_SILENCE_DURATION_MS = 100;
    private const int SPEECH_PAD_MS = 30;

    public static void Main(string[] args)
    {
        var vadDetector = new SileroVadDetector(MODEL_PATH, THRESHOLD, SAMPLE_RATE,
            MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);

        // Restored element type (lost in transit): one segment per detected
        // stretch of speech.
        List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));

        StringBuilder sb = new();
        foreach (var speechSegment in speechTimeList)
        {
            sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");
        }
        Console.WriteLine(sb.ToString());
    }
}
diff --git a/silero-vad/examples/csharp/SileroSpeechSegment.cs b/silero-vad/examples/csharp/SileroSpeechSegment.cs
new file mode 100644
index 0000000..f9d85be
--- /dev/null
+++ b/silero-vad/examples/csharp/SileroSpeechSegment.cs
@@ -0,0 +1,21 @@
+namespace VadDotNet;
+
/// <summary>
/// One detected stretch of speech, expressed both as raw sample offsets into
/// the audio and as seconds. A boundary is null until the detector has
/// decided it.
/// </summary>
public class SileroSpeechSegment
{
    /// <summary>Start of the segment, in samples, or null if not yet decided.</summary>
    public int? StartOffset { get; set; }

    /// <summary>End of the segment, in samples, or null if not yet decided.</summary>
    public int? EndOffset { get; set; }

    /// <summary>Start of the segment in seconds, or null if not yet computed.</summary>
    public float? StartSecond { get; set; }

    /// <summary>End of the segment in seconds, or null if not yet computed.</summary>
    public float? EndSecond { get; set; }

    /// <summary>Creates an empty segment with all boundaries undecided.</summary>
    public SileroSpeechSegment()
    {
    }

    /// <summary>
    /// Creates a fully-specified segment. <paramref name="startOffset"/> is
    /// nullable for consistency with the property and the other parameters
    /// (it was a plain int, which needlessly forbade half-open segments).
    /// </summary>
    public SileroSpeechSegment(int? startOffset, int? endOffset, float? startSecond, float? endSecond)
    {
        StartOffset = startOffset;
        EndOffset = endOffset;
        StartSecond = startSecond;
        EndSecond = endSecond;
    }
}
\ No newline at end of file
diff --git a/silero-vad/examples/csharp/SileroVadDetector.cs b/silero-vad/examples/csharp/SileroVadDetector.cs
new file mode 100644
index 0000000..28ec56d
--- /dev/null
+++ b/silero-vad/examples/csharp/SileroVadDetector.cs
@@ -0,0 +1,249 @@
+using NAudio.Wave;
+using VADdotnet;
+
+namespace VadDotNet;
+
/// <summary>
/// Converts a WAV file into a list of speech segments by running the Silero
/// VAD ONNX model window-by-window and applying the reference
/// threshold/hysteresis post-processing.
/// </summary>
public class SileroVadDetector
{
    private readonly SileroVadOnnxModel _model;
    private readonly float _threshold;                    // prob >= threshold  => speech starts/continues
    private readonly float _negThreshold;                 // prob <  negThreshold => candidate silence (hysteresis)
    private readonly int _samplingRate;
    private readonly int _windowSizeSample;               // samples per model window: 512 @ 16 kHz, 256 @ 8 kHz
    private readonly float _minSpeechSamples;             // shorter detections are dropped
    private readonly float _speechPadSamples;             // padding added around each kept segment
    private readonly float _maxSpeechSamples;             // force a split beyond this length
    private readonly float _minSilenceSamples;            // silence needed to close a segment
    private readonly float _minSilenceSamplesAtMaxSpeech; // silence usable as a split point near max length
    private int _audioLengthSamples;
    private const float THRESHOLD_GAP = 0.15f;
    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_8K = 8000;
    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_16K = 16000;

    /// <summary>
    /// Creates a detector for the given model and tuning parameters.
    /// </summary>
    /// <param name="onnxModelPath">Path to silero_vad.onnx.</param>
    /// <param name="threshold">Speech probability threshold (0..1).</param>
    /// <param name="samplingRate">8000 or 16000 only.</param>
    /// <exception cref="ArgumentException">If the sampling rate is unsupported.</exception>
    public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
        int minSpeechDurationMs, float maxSpeechDurationSeconds,
        int minSilenceDurationMs, int speechPadMs)
    {
        if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
        {
            throw new ArgumentException("Sampling rate not support, only available for [8000, 16000]");
        }

        this._model = new SileroVadOnnxModel(onnxModelPath);
        this._samplingRate = samplingRate;
        this._threshold = threshold;
        this._negThreshold = threshold - THRESHOLD_GAP;
        this._windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;
        this._minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
        this._speechPadSamples = samplingRate * speechPadMs / 1000f;
        // Budget left for actual speech once a window and both pads are accounted for.
        this._maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
        this._minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        // 98 ms: fixed constant from the reference implementation.
        this._minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
        this.Reset();
    }

    /// <summary>Resets the model's recurrent state for a fresh audio stream.</summary>
    public void Reset()
    {
        _model.ResetStates();
    }

    /// <summary>
    /// Runs the model over the whole file and returns the merged speech segments.
    /// </summary>
    public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
    {
        Reset();

        using var audioFile = new AudioFileReader(wavFile.FullName);
        List<float> speechProbList = [];
        // NOTE(review): assumes 16-bit mono source so bytes/2 == samples —
        // confirm for other formats.
        this._audioLengthSamples = (int)(audioFile.Length / 2);
        float[] buffer = new float[this._windowSizeSample];

        int samplesRead;
        while ((samplesRead = audioFile.Read(buffer, 0, buffer.Length)) > 0)
        {
            // Zero-pad a short final window so stale samples from the previous
            // read do not leak into the last probability estimate.
            if (samplesRead < buffer.Length)
            {
                Array.Clear(buffer, samplesRead, buffer.Length - samplesRead);
            }
            float speechProb = _model.Call([buffer], _samplingRate)[0];
            speechProbList.Add(speechProb);
        }

        return CalculateProb(speechProbList);
    }

    /// <summary>
    /// Turns the per-window probabilities into segments: opens a segment when
    /// the probability crosses the threshold, closes it after enough silence
    /// below the hysteresis threshold, force-splits overly long segments, then
    /// pads and merges.
    /// </summary>
    private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
    {
        List<SileroSpeechSegment> result = [];
        bool triggered = false;
        // tempEnd: candidate end while silence accumulates; prevEnd/nextStart:
        // bookkeeping for splitting a segment that exceeds the max length.
        int tempEnd = 0, prevEnd = 0, nextStart = 0;
        SileroSpeechSegment segment = new();

        for (int i = 0; i < speechProbList.Count; i++)
        {
            float speechProb = speechProbList[i];
            if (speechProb >= _threshold && (tempEnd != 0))
            {
                // Speech resumed before the silence was long enough: discard
                // the candidate end.
                tempEnd = 0;
                if (nextStart < prevEnd)
                {
                    nextStart = _windowSizeSample * i;
                }
            }

            if (speechProb >= _threshold && !triggered)
            {
                triggered = true;
                segment.StartOffset = _windowSizeSample * i;
                continue;
            }

            if (triggered && (_windowSizeSample * i) - segment.StartOffset > _maxSpeechSamples)
            {
                // Segment hit the maximum allowed length: split it.
                if (prevEnd != 0)
                {
                    // Split at the last usable silence and continue from nextStart.
                    segment.EndOffset = prevEnd;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();
                    if (nextStart < prevEnd)
                    {
                        triggered = false;
                    }
                    else
                    {
                        segment.StartOffset = nextStart;
                    }

                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                }
                else
                {
                    // No silence to split at: hard cut right here.
                    segment.EndOffset = _windowSizeSample * i;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }

            if (speechProb < _negThreshold && triggered)
            {
                if (tempEnd == 0)
                {
                    tempEnd = _windowSizeSample * i;
                }

                // Remember a silence long enough to serve as a max-length split point.
                if (((_windowSizeSample * i) - tempEnd) > _minSilenceSamplesAtMaxSpeech)
                {
                    prevEnd = tempEnd;
                }

                if ((_windowSizeSample * i) - tempEnd < _minSilenceSamples)
                {
                    // Not silent long enough yet; keep accumulating.
                    continue;
                }
                else
                {
                    segment.EndOffset = tempEnd;
                    // Drop detections shorter than the minimum speech duration.
                    if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
                    {
                        result.Add(segment);
                    }

                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }
        }

        // Close a segment still open at end-of-audio.
        if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
        {
            segment.EndOffset = speechProbList.Count * _windowSizeSample;
            result.Add(segment);
        }

        // Pad each segment, sharing the gap with its neighbour when the gap is
        // too small for full padding on both sides.
        for (int i = 0; i < result.Count; i++)
        {
            SileroSpeechSegment item = result[i];
            if (i == 0)
            {
                item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
            }

            if (i != result.Count - 1)
            {
                SileroSpeechSegment nextItem = result[i + 1];
                int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
                if (silenceDuration < 2 * _speechPadSamples)
                {
                    item.EndOffset += (silenceDuration / 2);
                    nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
                }
                else
                {
                    item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
                    nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
                }
            }
            else
            {
                item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
            }
        }

        return MergeListAndCalculateSecond(result, _samplingRate);
    }

    /// <summary>
    /// Merges overlapping/touching segments (after padding they may overlap)
    /// and fills in the second-based boundaries.
    /// </summary>
    private static List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
    {
        List<SileroSpeechSegment> result = [];
        if (original == null || original.Count == 0)
        {
            return result;
        }

        int left = original[0].StartOffset.Value;
        int right = original[0].EndOffset.Value;
        if (original.Count > 1)
        {
            original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
            for (int i = 1; i < original.Count; i++)
            {
                SileroSpeechSegment segment = original[i];

                if (segment.StartOffset > right)
                {
                    // Disjoint: flush the accumulated [left, right] interval.
                    result.Add(new SileroSpeechSegment(left, right,
                        CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
                    left = segment.StartOffset.Value;
                    right = segment.EndOffset.Value;
                }
                else
                {
                    // Overlapping/touching: extend the current interval.
                    right = Math.Max(right, segment.EndOffset.Value);
                }
            }

            result.Add(new SileroSpeechSegment(left, right,
                CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
        }
        else
        {
            result.Add(new SileroSpeechSegment(left, right,
                CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
        }

        return result;
    }

    /// <summary>Converts a sample offset to seconds, floored to millisecond precision.</summary>
    private static float CalculateSecondByOffset(int offset, int samplingRate)
    {
        float secondValue = offset * 1.0f / samplingRate;
        return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
    }
}
\ No newline at end of file
diff --git a/silero-vad/examples/csharp/SileroVadOnnxModel.cs b/silero-vad/examples/csharp/SileroVadOnnxModel.cs
new file mode 100644
index 0000000..b57d464
--- /dev/null
+++ b/silero-vad/examples/csharp/SileroVadOnnxModel.cs
@@ -0,0 +1,215 @@
+using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntime.Tensors;
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace VADdotnet;
+
+
+public class SileroVadOnnxModel : IDisposable
+{
+ private readonly InferenceSession session;
+ private float[][][] state;
+ private float[][] context;
+ private int lastSr = 0;
+ private int lastBatchSize = 0;
+ private static readonly List SAMPLE_RATES = [8000, 16000];
+
+ public SileroVadOnnxModel(string modelPath)
+ {
+ var sessionOptions = new SessionOptions
+ {
+ InterOpNumThreads = 1,
+ IntraOpNumThreads = 1,
+ EnableCpuMemArena = true
+ };
+
+ session = new InferenceSession(modelPath, sessionOptions);
+ ResetStates();
+ }
+
+ public void ResetStates()
+ {
+ state = new float[2][][];
+ state[0] = new float[1][];
+ state[1] = new float[1][];
+ state[0][0] = new float[128];
+ state[1][0] = new float[128];
+ context = [];
+ lastSr = 0;
+ lastBatchSize = 0;
+ }
+
+ public void Dispose()
+ {
+ GC.SuppressFinalize(this);
+ }
+
+ public class ValidationResult(float[][] x, int sr)
+ {
+ public float[][] X { get; } = x;
+ public int Sr { get; } = sr;
+ }
+
+ private static ValidationResult ValidateInput(float[][] x, int sr)
+ {
+ if (x.Length == 1)
+ {
+ x = [x[0]];
+ }
+ if (x.Length > 2)
+ {
+ throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}");
+ }
+
+ if (sr != 16000 && (sr % 16000 == 0))
+ {
+ int step = sr / 16000;
+ float[][] reducedX = new float[x.Length][];
+
+ for (int i = 0; i < x.Length; i++)
+ {
+ float[] current = x[i];
+ float[] newArr = new float[(current.Length + step - 1) / step];
+
+ for (int j = 0, index = 0; j < current.Length; j += step, index++)
+ {
+ newArr[index] = current[j];
+ }
+
+ reducedX[i] = newArr;
+ }
+
+ x = reducedX;
+ sr = 16000;
+ }
+
+ if (!SAMPLE_RATES.Contains(sr))
+ {
+ throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
+ }
+
+ if (((float)sr) / x[0].Length > 31.25)
+ {
+ throw new ArgumentException("Input audio is too short");
+ }
+
+ return new ValidationResult(x, sr);
+ }
+
+ private static float[][] Concatenate(float[][] a, float[][] b)
+ {
+ if (a.Length != b.Length)
+ {
+ throw new ArgumentException("The number of rows in both arrays must be the same.");
+ }
+
+ int rows = a.Length;
+ int colsA = a[0].Length;
+ int colsB = b[0].Length;
+ float[][] result = new float[rows][];
+
+ for (int i = 0; i < rows; i++)
+ {
+ result[i] = new float[colsA + colsB];
+ Array.Copy(a[i], 0, result[i], 0, colsA);
+ Array.Copy(b[i], 0, result[i], colsA, colsB);
+ }
+
+ return result;
+ }
+
+ private static float[][] GetLastColumns(float[][] array, int contextSize)
+ {
+ int rows = array.Length;
+ int cols = array[0].Length;
+
+ if (contextSize > cols)
+ {
+ throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
+ }
+
+ float[][] result = new float[rows][];
+
+ for (int i = 0; i < rows; i++)
+ {
+ result[i] = new float[contextSize];
+ Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
+ }
+
+ return result;
+ }
+
+ public float[] Call(float[][] x, int sr)
+ {
+ var result = ValidateInput(x, sr);
+ x = result.X;
+ sr = result.Sr;
+ int numberSamples = sr == 16000 ? 512 : 256;
+
+ if (x[0].Length != numberSamples)
+ {
+ throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
+ }
+
+ int batchSize = x.Length;
+ int contextSize = sr == 16000 ? 64 : 32;
+
+ if (lastBatchSize == 0)
+ {
+ ResetStates();
+ }
+ if (lastSr != 0 && lastSr != sr)
+ {
+ ResetStates();
+ }
+ if (lastBatchSize != 0 && lastBatchSize != batchSize)
+ {
+ ResetStates();
+ }
+
+ if (context.Length == 0)
+ {
+ context = new float[batchSize][];
+ for (int i = 0; i < batchSize; i++)
+ {
+ context[i] = new float[contextSize];
+ }
+ }
+
+ x = Concatenate(context, x);
+
+ var inputs = new List