vad_cut4asr.py
import os
import sys
from time import time

import librosa
import numpy as np
import soundfile as sf
import torch
from loguru import logger

from vad import CRNN_VAD_STREAM
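

# Usage sketch (the list file name and output directory below are placeholder
# examples, not values required by this script):
#   python vad_cut4asr.py wav.list ./cut_output 0
# argv[1]: text file with one wav path per line
# argv[2]: output directory for the cut segments
# argv[3]: use_gpu flag (0 = CPU, 1 = CUDA if available)
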
def vad_process(wav_list, output_path, use_gpu):
    sample_rate = 16000
    device = 'cpu'  # CPU is fast enough.
    if torch.cuda.is_available() and use_gpu:
        device = 'cuda'
    logger.info(f"use {device} to cut long audios.")
    vad_model = CRNN_VAD_STREAM(left_frames=4,
                                right_frames=4,
                                device=device)
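
    # CRNN_VAD_STREAM interface as used below (see the vad module for the
    # authoritative definitions; these one-line summaries are inferred from how
    # this script calls it):
    #   clear_gru_buffer()         reset the GRU state before a new file
    #   stream_asr_endpoint(x16k)  feed one 16 kHz chunk; returns frame posteriors,
    #                              chunk state, frame labels and an ASR endpoint flag
    #   reset_state()              re-arm endpoint detection after an endpoint fires
    #   buf, mel_nfft, right_samples  internal buffer / feature context sizes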
    save_cutted_wav = True
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    total_time = 0
    for f_name in wav_list:
        f_name = f_name.strip()
        logger.info(f'processing {f_name}')
        wav_name = f_name.split('/')[-1].split('.')[0]
        # Load at the native sample rate (librosa's default would resample to 22.05 kHz).
        wave, sr = librosa.load(f_name, sr=None)
        if wave.ndim > 1:
            wave = wave.mean(-1)
        if sr != 24000:
            wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
        # Clear the GRU buffer at the beginning of each wave.
        vad_model.clear_gru_buffer()
        speech_segment = []  # audio with noise and silence filtered out
        speech_segment_num = 0
        asr_segment = []  # audio segmented for ASR
        asr_segment_num = 0
        wave_padded = wave
        # # The two pads below are not applied in a real streaming scenario; they are only
        # # used here to verify the forward computation, matching the feature extraction
        # # used during training.
        # # This pads an extra mel_nfft + right_samples samples, which is exactly the model latency.
        # wave_padded = np.pad(wave, (vad_model.mel_nfft // 2), mode='reflect')
        # # Pad the audio with future context in advance; that part is never output, only the
        # # central part is.
        # wave_padded = np.pad(wave_padded, (0, vad_model.right_samples))
        wav_padded_len = wave_padded.shape[-1]
        start_at = time()
        # The number of samples fed per call must be a multiple of 200. Here we assume
        # 0.2 s of 24 kHz audio per chunk, i.e. 4800 samples.
        central_samples = 4800
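        # Chunk-size arithmetic for reference: 0.2 s * 24000 Hz = 4800 samples per chunk,
        # which becomes 4800 * 16000 / 24000 = 3200 samples (still a multiple of 200)
        # after the 24 kHz -> 16 kHz resampling done before VAD inference below.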
        for start in range(0, wav_padded_len, central_samples):
            end = min(start + central_samples, wav_padded_len)
            wave_chunk = wave_padded[start:end]
            # The remaining wave is shorter than the buffer length.
            if wav_padded_len - start < len(vad_model.buf):
                wave_chunk = wave_padded[start:]
                start = wav_padded_len  # to break the loop
            # Only VAD inference uses 16 kHz audio; everything else stays at 24 kHz.
            wav_16k = librosa.resample(wave_chunk, orig_sr=24000, target_sr=16000)
            if start != wav_padded_len:  # not the last chunk
                # Compute the speech posteriors, the current chunk state, the frame-level
                # speech labels and the ASR endpoint type.
                vad_post_chunk, chunk_state, vad_pred_chunk, asr_endpoint = \
                    vad_model.stream_asr_endpoint(wav_16k,
                                                  max_speech_len=25,
                                                  max_trailing_silence=150,)
                # # The audio saved below has silence and noise removed, keeping only the
                # # frames that should be activated.
                # if chunk_state != 0:  # concatenate whenever the chunk is not pure silence
                #     speech_segment += wave_chunk.tolist()
                # if chunk_state == 3:  # SPEECH_END
                #     # Whenever a segment ends, write out the cut wav for inspection and analysis.
                #     if output_path and save_cutted_wav:
                #         sf.write(os.path.join(
                #             output_path,
                #             f"{wav_name}_active_{speech_segment_num:05d}.wav"),
                #             np.array(speech_segment),
                #             samplerate=sample_rate)
                #     speech_segment_num += 1
                #     speech_segment = []  # clear the segment
                # The code below demonstrates VAD-based segmentation for ASR: the saved audio
                # is simply the streaming input, split at the positions where a break is needed.
                # Require an endpoint plus at least 10 s (10 * 24000 samples) of accumulated audio.
                if asr_endpoint > 0 and len(asr_segment) >= 10 * 24000:  # VAD detected an endpoint
                    vad_model.reset_state()  # reset so the next detection can start
                    # Whenever a segment ends, write out the cut wav for inspection and analysis.
                    if output_path and save_cutted_wav:
                        sf.write(os.path.join(
                            output_path,
                            f"{wav_name}_{asr_segment_num:05d}.wav"),
                            np.array(asr_segment),
                            samplerate=24000)
                    asr_segment_num += 1
                    asr_segment = []  # clear the segment
            # The latency is roughly one 200 ms chunk, so appending the current chunk after
            # its decision puts it in the correct segment.
            asr_segment += wave_chunk.tolist()
            if start == wav_padded_len:
                break
        # Flush whatever is left after the last chunk.
        if len(asr_segment) > 0 and output_path and save_cutted_wav:
            sf.write(os.path.join(
                output_path,
                f"{wav_name}_{asr_segment_num:05d}.wav"),
                np.array(asr_segment),
                samplerate=24000)
        end_at = time()
        logger.info(f"{f_name} {end_at - start_at}s")
        total_time += (end_at - start_at)
    logger.info(f"device: {device}, total_time: {total_time}s")


if __name__ == "__main__":
    wav_list = sys.argv[1]
    output_path = sys.argv[2]
    use_gpu = int(sys.argv[3])
    MAX_THREAD = 1
    with open(wav_list, 'r') as f_wav:
        file_list = f_wav.readlines()
    th_cnt = min(MAX_THREAD, len(file_list))
    if th_cnt == 1:
        vad_process(file_list, output_path, use_gpu)
    else:
        import threading
        total_cnt = len(file_list)
        # Ceiling division: each thread handles job_per_th files, the last one possibly fewer.
        job_per_th = total_cnt // th_cnt if total_cnt % th_cnt == 0 else total_cnt // th_cnt + 1
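        # For example (illustrative numbers): 10 files over 3 threads gives
        # job_per_th = 4, i.e. slices [0:4], [4:8] and [8:10].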
        threads = []
        for th in range(th_cnt):
            start_idx = th * job_per_th
            end_idx = min((th + 1) * job_per_th, total_cnt)
            threads.append(threading.Thread(target=vad_process,
                                            args=(file_list[start_idx:end_idx], output_path, use_gpu)))
        for th in range(th_cnt):
            threads[th].start()
        # Wait for all workers to finish before the process exits.
        for th in range(th_cnt):
            threads[th].join()
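
# Example wav list contents for argv[1] (paths are illustrative, one per line):
#   /data/audio/session_0001.wav
#   /data/audio/session_0002.wav
# The resulting cuts are written to argv[2] as <wav_name>_00000.wav, <wav_name>_00001.wav, ...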