K-Means 算法识别语音数字 0-9
K-Means 算法识别语音数字 0-9
数据集
import numpy as np
from scipy.cluster.vq import kmeans
import librosa
import pdb
def extract_feature(audio_path):
    """Summarize an audio file as one time-averaged MFCC vector.

    Args:
        audio_path: Path of the audio file to load with librosa.

    Returns:
        The mean of the 13 MFCC coefficients taken across all frames.
    """
    # sr=None asks librosa to keep the file's native sampling rate.
    signal, sample_rate = librosa.load(audio_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # Transpose so that axis 0 runs over frames, then average them away.
    per_frame = coefficients.T
    return np.mean(per_frame, axis=0)
def train_kmeans_models(train_audio, num_clusters=3):
    """Fit one k-means codebook per label from that label's training clips.

    Args:
        train_audio: Mapping from a digit label to an iterable of audio file
            paths recorded for that digit.
        num_clusters: Number of centroids requested from k-means per label.

    Returns:
        A dict mapping every label in ``train_audio`` to the centroid array
        returned by ``scipy.cluster.vq.kmeans`` for that label's features.

    Raises:
        ValueError: If a label has no training clips.
    """
    models = {}
    # Walk the labels that were actually supplied instead of a fixed
    # range(10), so a missing digit no longer raises KeyError and other
    # label sets work unchanged.
    for digit, audio_paths in train_audio.items():
        features = np.array([extract_feature(path) for path in audio_paths])
        if features.size == 0:
            raise ValueError(f"no training audio supplied for label {digit!r}")
        # kmeans also returns the mean distortion, which is not needed here.
        centroids, _ = kmeans(features, num_clusters)
        models[digit] = centroids
    return models
def recognize_digit(audio_path, models):
    """Predict the label whose centroids lie closest to an audio clip.

    Args:
        audio_path: Path of the audio file to classify.
        models: Mapping from label to an array of centroids, one row per
            centroid, as produced by ``train_kmeans_models``.

    Returns:
        The label owning the centroid with the smallest Euclidean distance to
        the clip's feature vector, or ``-1`` when no label produced a
        distance below infinity (for example when ``models`` is empty).
    """
    test_feature = extract_feature(audio_path)
    # The fallback must be bound under the same name that is returned; the
    # earlier misspelling left the result undefined when no model matched.
    predicted_digit = -1
    min_dist = float("inf")
    for digit, centroids in models.items():
        # Distance from the clip to each centroid of this label.
        distances = np.linalg.norm(centroids - test_feature, axis=1)
        nearest = np.min(distances)
        # Strict comparison keeps the first label seen on an exact tie.
        if nearest < min_dist:
            min_dist = nearest
            predicted_digit = digit
    return predicted_digit
if __name__ == "__main__":
    # Root folder holding one sub-folder of recordings per digit.
    data_dir = "D:\\workSpace\\AiLearnCode\\0-9DataSet\\b2012-main\\0-9"

    # Recordings 10.wav through 15.wav of every digit form the training set.
    # Generating the paths keeps each digit pointed at its own folder; the
    # hand-written list had digit 9 borrowing 11.wav from digit 0's folder.
    train_audio = {
        digit: [f"{data_dir}\\{digit}\\{sample}.wav" for sample in range(10, 16)]
        for digit in range(10)
    }

    # Train one k-means model per digit.
    models = train_kmeans_models(train_audio, num_clusters=3)

    # Classify a recording that is not part of the training set.
    test_audio = f"{data_dir}\\1\\22.wav"
    recognized = recognize_digit(test_audio, models)
    print(f"识别结果为: {recognized}")