How to Implement Baby Cry Detection and Recognition with PyTorch
Baby cry detection has practical uses in smart homes, baby monitors, and similar products. Below is a walkthrough of building a deep-learning-based baby cry detection system with PyTorch.
1. Data Preparation
First, collect an audio dataset containing both baby cries and background sounds. Public datasets you can draw from include:
• ESC-50 (contains a crying-baby category)
• UrbanSound8K
• A custom, self-recorded dataset
【python】
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class CrySoundDataset(Dataset):
    def __init__(self, file_paths, labels, sample_rate=16000, n_mels=64,
                 n_fft=1024, hop_length=512, max_frames=64):
        self.file_paths = file_paths
        self.labels = labels
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.max_frames = max_frames  # fixed number of time frames so batches stack and the CNN input size holds

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        # Load the audio file
        audio, sr = librosa.load(self.file_paths[idx], sr=self.sample_rate)
        # Extract the mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_mels=self.n_mels,
            n_fft=self.n_fft, hop_length=self.hop_length
        )
        # Convert to decibels
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        # Pad or truncate to a fixed number of frames
        if mel_spec.shape[1] < self.max_frames:
            pad_width = self.max_frames - mel_spec.shape[1]
            mel_spec = np.pad(mel_spec, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mel_spec = mel_spec[:, :self.max_frames]
        # Add a channel dimension and normalize
        mel_spec = np.expand_dims(mel_spec, axis=0)
        mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-6)
        label = self.labels[idx]
        return torch.FloatTensor(mel_spec), torch.LongTensor([label])

# Assume we already have lists of file paths and labels
# file_paths = [...]  # list of audio file paths
# labels = [...]      # corresponding labels (0: not crying, 1: crying)

# Split into training and test sets
# train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create datasets and data loaders
# train_dataset = CrySoundDataset(train_paths, train_labels)
# test_dataset = CrySoundDataset(test_paths, test_labels)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
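As a concrete example of how file_paths and labels might be assembled, ESC-50 ships a metadata CSV listing each clip's category, so the crying_baby class can be treated as the positive class. The sketch below assumes the standard ESC-50 repository layout (an audio/ folder plus meta/esc50.csv with filename and category columns); adjust the root path to your local copy.
【python】
import pandas as pd

esc50_root = "ESC-50-master"  # path to a local ESC-50 checkout (adjust as needed)
meta = pd.read_csv(os.path.join(esc50_root, "meta", "esc50.csv"))

file_paths = [os.path.join(esc50_root, "audio", f) for f in meta["filename"]]
labels = [1 if cat == "crying_baby" else 0 for cat in meta["category"]]  # 1 = baby cry, 0 = everything else

Only 1 of the 50 ESC-50 classes is crying_baby, so labels built this way are heavily imbalanced; consider subsampling the negative clips or using a class-weighted loss.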
2. Model Architecture
We use a simple 2D CNN to classify the mel spectrograms:
【python】
import torch.nn as nn
import torch.nn.functional as F

class CrySoundDetector(nn.Module):
    def __init__(self, num_classes=2):
        super(CrySoundDetector, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 2))
        # Assumes a 64x64 mel spectrogram input: three 2x2 poolings reduce it to 8x8
        self.fc1 = nn.Linear(128 * 8 * 8, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # Input shape: (batch_size, 1, n_mels, time_steps)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool3(x)
        # Flatten
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
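A quick sanity check with a dummy input confirms that the fully connected layer matches the assumed 64x64 input (64 mel bands x 64 time frames). This is a minimal sketch, not part of the training pipeline.
【python】
# Three 2x2 poolings reduce 64x64 to 8x8, matching fc1 = 128 * 8 * 8
model = CrySoundDetector(num_classes=2)
dummy = torch.randn(1, 1, 64, 64)  # (batch, channel, n_mels, time_frames)
print(model(dummy).shape)  # torch.Size([1, 2])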
3. Training
【python】
import torch.optim as optim
from tqdm import tqdm

def train_model(model, train_loader, test_loader, num_epochs=20, learning_rate=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        # Training phase
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            inputs, labels = inputs.to(device), labels.to(device).squeeze(1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_loss = running_loss / len(train_loader)
        train_acc = 100 * correct / total

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device).squeeze(1)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        val_loss = val_loss / len(test_loader)
        val_acc = 100 * val_correct / val_total

        # Update the learning rate
        scheduler.step(val_loss)

        # Save the best model
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), 'best_cry_detector.pth')

        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    print(f"Training complete. Best validation accuracy: {best_acc:.2f}%")

# Initialize the model and train
# model = CrySoundDetector(num_classes=2)
# train_model(model, train_loader, test_loader, num_epochs=20)
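Accuracy alone can be misleading when crying clips are rare, so it is worth inspecting per-class precision and recall after training. The sketch below uses scikit-learn and assumes the test_loader and the saved best_cry_detector.pth from above.
【python】
from sklearn.metrics import classification_report, confusion_matrix

def evaluate(model_path='best_cry_detector.pth'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CrySoundDetector(num_classes=2)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device).eval()

    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            all_preds.extend(outputs.argmax(dim=1).cpu().tolist())
            all_labels.extend(labels.view(-1).tolist())

    print(confusion_matrix(all_labels, all_preds))
    print(classification_report(all_labels, all_preds, target_names=['no_cry', 'cry']))

# evaluate()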
4. Real-Time Detection
Once training is complete, we can implement a real-time detection loop:
【python】
import pyaudio
import time

def real_time_detection(model_path='best_cry_detector.pth', chunk_duration=1.0, sample_rate=16000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the trained model
    model = CrySoundDetector(num_classes=2)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()

    # Audio parameters
    n_mels = 64
    n_fft = 1024
    hop_length = 512
    chunk_size = int(chunk_duration * sample_rate)

    # Initialize PyAudio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size)

    print("Starting real-time cry detection... (Press Ctrl+C to stop)")
    try:
        while True:
            # Read a chunk of audio from the microphone
            data = stream.read(chunk_size, exception_on_overflow=False)
            audio = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0  # normalize to [-1, 1]

            # Extract the mel spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=audio, sr=sample_rate, n_mels=n_mels,
                n_fft=n_fft, hop_length=hop_length
            )
            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

            # Pad or truncate to match the model input (64 time frames, as in training)
            if mel_spec.shape[1] < 64:
                pad_width = 64 - mel_spec.shape[1]
                mel_spec = np.pad(mel_spec, ((0, 0), (0, pad_width)), mode='constant')
            else:
                mel_spec = mel_spec[:, :64]

            # Normalize the same way as during training
            mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-6)

            # Add batch and channel dimensions
            mel_spec = np.expand_dims(np.expand_dims(mel_spec, axis=0), axis=0)
            mel_spec = torch.FloatTensor(mel_spec).to(device)

            # Predict
            with torch.no_grad():
                output = model(mel_spec)
                _, predicted = torch.max(output.data, 1)
                prob = torch.softmax(output, dim=1)[0][1].item()  # probability of "cry"

            # Display the result
            if predicted.item() == 1:
                print(f"\033[91mCry detected! Probability: {prob:.2f}\033[0m")
            else:
                print(f"No cry detected. Probability: {prob:.2f}")

            time.sleep(0.1)  # brief pause
    except KeyboardInterrupt:
        print("Stopping detection...")
        stream.stop_stream()
        stream.close()
        p.terminate()

# Run real-time detection
# real_time_detection()
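To sanity-check the trained model without a microphone, the same preprocessing can be run on a single audio file. This is a minimal sketch; the predict_file helper and the example file name are not part of the code above.
【python】
def predict_file(path, model_path='best_cry_detector.pth'):
    """Return the cry probability for one audio file (hypothetical helper)."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CrySoundDetector(num_classes=2)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device).eval()

    # Reuse the dataset's preprocessing for a single file (the label 0 is a dummy)
    features, _ = CrySoundDataset([path], [0])[0]
    with torch.no_grad():
        output = model(features.unsqueeze(0).to(device))
        return torch.softmax(output, dim=1)[0][1].item()

# print(predict_file('example_clip.wav'))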
5. Possible Improvements
1. Data augmentation: add background noise, time stretching, pitch shifting, and similar techniques (a sketch follows below)
2. Stronger models: swap in pretrained architectures such as ResNet or EfficientNet (see the second sketch below)
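For item 1, waveform-level augmentation can be applied before feature extraction, for example inside CrySoundDataset.__getitem__ for training samples only. The parameter values below are purely illustrative.
【python】
def augment_waveform(audio, sr):
    """Randomly apply noise, time stretching, and pitch shifting (illustrative parameters)."""
    # Add low-level Gaussian background noise
    if np.random.rand() < 0.5:
        audio = audio + 0.005 * np.random.randn(len(audio))
    # Time-stretch slightly faster or slower (length changes, but frames are padded/truncated later)
    if np.random.rand() < 0.5:
        audio = librosa.effects.time_stretch(audio, rate=np.random.uniform(0.9, 1.1))
    # Shift the pitch by up to +/- 2 semitones
    if np.random.rand() < 0.5:
        audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=np.random.uniform(-2, 2))
    return audio.astype(np.float32)

For item 2, a standard image backbone can be adapted to single-channel spectrograms by replacing its first convolution and its classification head. A minimal sketch with torchvision (the exact weights argument depends on your torchvision version):
【python】
import torchvision.models as models

def build_resnet_detector(num_classes=2):
    # ImageNet-pretrained ResNet-18; older torchvision uses pretrained=True instead of weights=
    net = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
    # Replace the 3-channel stem with a 1-channel one (this layer is re-initialized from scratch)
    net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    net.fc = nn.Linear(net.fc.in_features, num_classes)
    return net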