Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/azurespeechapi.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include "azurespeechapi.h"
#include <QDebug>

using namespace Microsoft::CognitiveServices::Speech;
using namespace Microsoft::CognitiveServices::Speech::Translation;
Expand Down
1 change: 0 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <QDateTime>
#include <windows.h>
#include <dbghelp.h>
#include <QDebug>
#include "mainwindow.h"
#include "logger.h"

Expand Down
201 changes: 183 additions & 18 deletions src/wasapiaudiocapture.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "wasapiaudiocapture.h"
#include <QDebug>
#include "logger.h"
#include <comdef.h>
#include <vector>
#include <cmath>

WasapiAudioCapture::WasapiAudioCapture(QObject *parent)
: QObject(parent)
Expand All @@ -11,30 +14,42 @@ WasapiAudioCapture::WasapiAudioCapture(QObject *parent)
, m_isCapturing(false)
, m_waveFormat(nullptr)
, m_bufferFrameCount(0)
, logger(std::make_unique<Logger>())
{
LOG_INFO("WasapiAudioCapture 初始化");
}

WasapiAudioCapture::~WasapiAudioCapture()
{
LOG_INFO("WasapiAudioCapture 析构");
stopCapture();
cleanupWASAPI();
}

bool WasapiAudioCapture::initializeWASAPI()
{
LOG_INFO("开始初始化 WASAPI");
HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator),
nullptr,
CLSCTX_ALL,
__uuidof(IMMDeviceEnumerator),
(void**)&m_deviceEnumerator);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法创建设备枚举器,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法创建设备枚举器");
return false;
}

// 获取默认音频输出设备
hr = m_deviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &m_audioDevice);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法获取默认音频输出设备,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法获取默认音频输出设备");
return false;
}
Expand All @@ -45,52 +60,117 @@ bool WasapiAudioCapture::initializeWASAPI()
nullptr,
(void**)&m_audioClient);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法激活音频客户端,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法激活音频客户端");
return false;
}

// 设置音频格式
m_waveFormat = new WAVEFORMATEX();
m_waveFormat->wFormatTag = WAVE_FORMAT_PCM;
m_waveFormat->nChannels = CHANNELS;
m_waveFormat->nSamplesPerSec = SAMPLE_RATE;
m_waveFormat->wBitsPerSample = BITS_PER_SAMPLE;
m_waveFormat->nBlockAlign = (m_waveFormat->nChannels * m_waveFormat->wBitsPerSample) / 8;
m_waveFormat->nAvgBytesPerSec = m_waveFormat->nSamplesPerSec * m_waveFormat->nBlockAlign;
m_waveFormat->cbSize = 0;
// 获取系统混音格式(可能是 WAVEFORMATEXTENSIBLE)
WAVEFORMATEX* pMixFormat = nullptr;
hr = m_audioClient->GetMixFormat(&pMixFormat);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法获取设备混音格式,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法获取设备混音格式");
return false;
}

// 检查是否支持 16kHz/16bit/单声道格式
WAVEFORMATEX desiredFormat = {0};
desiredFormat.wFormatTag = WAVE_FORMAT_PCM;
desiredFormat.nChannels = 1;
desiredFormat.nSamplesPerSec = 16000;
desiredFormat.wBitsPerSample = 16;
desiredFormat.nBlockAlign = desiredFormat.nChannels * desiredFormat.wBitsPerSample / 8;
desiredFormat.nAvgBytesPerSec = desiredFormat.nSamplesPerSec * desiredFormat.nBlockAlign;

BOOL supported = FALSE;
hr = m_audioClient->IsFormatSupported(AUDCLNT_SHAREMODE_SHARED, &desiredFormat, nullptr);
if (SUCCEEDED(hr)) {
LOG_INFO("设备支持 16kHz/16bit/单声道格式,将使用此格式");
// 分配足够的内存来存储 WAVEFORMATEXTENSIBLE
m_waveFormat = (WAVEFORMATEXTENSIBLE*)CoTaskMemAlloc(sizeof(WAVEFORMATEXTENSIBLE));
if (!m_waveFormat) {
LOG_ERROR("内存分配失败");
CoTaskMemFree(pMixFormat);
emit error("内存分配失败");
return false;
}
// 复制基本格式信息
memcpy(&m_waveFormat->Format, &desiredFormat, sizeof(WAVEFORMATEX));
// 设置扩展信息
m_waveFormat->Format.cbSize = sizeof(WAVEFORMATEXTENSIBLE) - sizeof(WAVEFORMATEX);
m_waveFormat->Samples.wValidBitsPerSample = desiredFormat.wBitsPerSample;
m_waveFormat->dwChannelMask = SPEAKER_FRONT_CENTER;
m_waveFormat->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
} else {
LOG_INFO(QString("设备不支持 16kHz/16bit/单声道格式,将使用系统混音格式:%1 通道, %2 Hz, %3 bit")
.arg(pMixFormat->nChannels)
.arg(pMixFormat->nSamplesPerSec)
.arg(pMixFormat->wBitsPerSample));
// 直接使用系统混音格式
m_waveFormat = (WAVEFORMATEXTENSIBLE*)pMixFormat;
pMixFormat = nullptr; // 防止在后面的 CoTaskMemFree 中重复释放
}

// 初始化音频客户端
// 使用选定的格式初始化
hr = m_audioClient->Initialize(AUDCLNT_SHAREMODE_SHARED,
AUDCLNT_STREAMFLAGS_LOOPBACK,
0,
0,
m_waveFormat,
0, // 默认缓冲区时长
0, // event-driven 模式
(WAVEFORMATEX*)m_waveFormat,
nullptr);

if (pMixFormat) {
CoTaskMemFree(pMixFormat); // 如果之前没有使用系统混音格式,释放它
}

if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法初始化音频客户端,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法初始化音频客户端");
return false;
}

// 获取缓冲区大小
hr = m_audioClient->GetBufferSize(&m_bufferFrameCount);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法获取缓冲区大小,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法获取缓冲区大小");
return false;
}

LOG_INFO(QString("音频缓冲区大小:%1 帧").arg(m_bufferFrameCount));

// 获取捕获客户端
hr = m_audioClient->GetService(__uuidof(IAudioCaptureClient),
(void**)&m_captureClient);
if (FAILED(hr)) {
_com_error err(hr);
LOG_ERROR(QString("无法获取捕获客户端,错误代码: 0x%1, 描述: %2")
.arg(hr, 8, 16, QChar('0'))
.arg(QString::fromWCharArray(err.Description())));
emit error("无法获取捕获客户端");
return false;
}

LOG_INFO("WASAPI 初始化完成");
return true;
}

void WasapiAudioCapture::cleanupWASAPI()
{
LOG_INFO("开始清理 WASAPI 资源");
if (m_captureClient) {
m_captureClient->Release();
m_captureClient = nullptr;
Expand All @@ -108,35 +188,42 @@ void WasapiAudioCapture::cleanupWASAPI()
m_deviceEnumerator = nullptr;
}
if (m_waveFormat) {
delete m_waveFormat;
CoTaskMemFree(m_waveFormat); // 使用 CoTaskMemFree 释放内存
m_waveFormat = nullptr;
}
LOG_INFO("WASAPI 资源清理完成");
}

bool WasapiAudioCapture::startCapture()
{
if (m_isCapturing) {
LOG_INFO("音频捕获已经在进行中");
return true;
}

LOG_INFO("开始音频捕获");
if (!initializeWASAPI()) {
LOG_ERROR("初始化 WASAPI 失败");
return false;
}

HRESULT hr = m_audioClient->Start();
if (FAILED(hr)) {
LOG_ERROR("无法启动音频捕获");
emit error("无法启动音频捕获");
return false;
}

m_isCapturing = true;
m_captureThread = CreateThread(nullptr, 0, captureThread, this, 0, nullptr);
if (!m_captureThread) {
LOG_ERROR("无法创建捕获线程");
emit error("无法创建捕获线程");
stopCapture();
return false;
}

LOG_INFO("音频捕获已启动");
return true;
}

Expand All @@ -146,6 +233,7 @@ void WasapiAudioCapture::stopCapture()
return;
}

LOG_INFO("停止音频捕获");
m_isCapturing = false;
if (m_captureThread) {
WaitForSingleObject(m_captureThread, INFINITE);
Expand All @@ -156,6 +244,7 @@ void WasapiAudioCapture::stopCapture()
if (m_audioClient) {
m_audioClient->Stop();
}
LOG_INFO("音频捕获已停止");
}

DWORD WINAPI WasapiAudioCapture::captureThread(LPVOID context)
Expand Down Expand Up @@ -205,7 +294,83 @@ void WasapiAudioCapture::processAudioData(const BYTE* data, UINT32 numFrames)
return;
}

UINT32 dataSize = numFrames * m_waveFormat->nBlockAlign;
QByteArray audioData(reinterpret_cast<const char*>(data), dataSize);
emit audioDataReceived(audioData);
// 将 BYTE* 转换为 float* 数组
auto floatBuf = reinterpret_cast<const float*>(data);

// 创建 16-bit PCM 缓冲区
std::vector<int16_t> pcm16;
pcm16.reserve(numFrames * m_waveFormat->Format.nChannels);

// 转换每个采样点
for (UINT32 i = 0; i < numFrames * m_waveFormat->Format.nChannels; ++i) {
float f = floatBuf[i];
// 裁剪到 [-1.0, 1.0] 范围
if (f > 1.f) f = 1.f;
if (f < -1.f) f = -1.f;
// 转换为 16-bit PCM
pcm16.push_back(static_cast<int16_t>(f * 32767));
}

// 如果当前不是 16kHz/16bit/单声道,需要进行转换
std::vector<int16_t> finalBuffer;
if (m_waveFormat->Format.nSamplesPerSec != 16000 || m_waveFormat->Format.nChannels != 1) {
// 先进行声道混音(如果需要)
std::vector<int16_t> monoBuffer;
if (m_waveFormat->Format.nChannels > 1) {
monoBuffer.reserve(numFrames);
for (UINT32 i = 0; i < numFrames; ++i) {
int32_t sum = 0;
for (UINT32 ch = 0; ch < m_waveFormat->Format.nChannels; ++ch) {
sum += pcm16[i * m_waveFormat->Format.nChannels + ch];
}
monoBuffer.push_back(static_cast<int16_t>(sum / m_waveFormat->Format.nChannels));
}
} else {
monoBuffer = std::move(pcm16);
}

// 然后进行重采样(如果需要)
if (m_waveFormat->Format.nSamplesPerSec != 16000) {
const double ratio = 16000.0 / m_waveFormat->Format.nSamplesPerSec;
const size_t newSize = static_cast<size_t>(std::ceil(monoBuffer.size() * ratio));
finalBuffer.reserve(newSize);

// 线性插值重采样
for (size_t i = 0; i < newSize; ++i) {
double pos = i / ratio;
size_t pos1 = static_cast<size_t>(std::floor(pos));
size_t pos2 = pos1 + 1;
double frac = pos - pos1;

// 边界检查
if (pos2 >= monoBuffer.size()) {
pos2 = monoBuffer.size() - 1;
}

// 线性插值
double sample = monoBuffer[pos1] * (1.0 - frac) + monoBuffer[pos2] * frac;
finalBuffer.push_back(static_cast<int16_t>(std::round(sample)));
}
} else {
finalBuffer = std::move(monoBuffer);
}
} else {
finalBuffer = std::move(pcm16);
}

// 创建输出数据
QByteArray out(reinterpret_cast<const char*>(finalBuffer.data()), finalBuffer.size() * sizeof(int16_t));

// 使用静态计数器来减少日志输出频率
static int logCounter = 0;
if (++logCounter % 100 == 0) { // 每100个数据块记录一次
LOG_INFO(QString("处理音频数据:输入 %1 帧,%2 通道,%3 Hz,输出 %4 帧,16kHz/16bit/单声道,输出大小:%5 字节")
.arg(numFrames)
.arg(m_waveFormat->Format.nChannels)
.arg(m_waveFormat->Format.nSamplesPerSec)
.arg(finalBuffer.size())
.arg(out.size()));
}

emit audioDataReceived(out);
}
5 changes: 4 additions & 1 deletion src/wasapiaudiocapture.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <mmdeviceapi.h>
#include <audioclient.h>
#include <functiondiscoverykeys_devpkey.h>
#include "logger.h"
#include <memory>

class WasapiAudioCapture : public QObject
{
Expand Down Expand Up @@ -35,8 +37,9 @@ class WasapiAudioCapture : public QObject
IAudioCaptureClient* m_captureClient;
HANDLE m_captureThread;
bool m_isCapturing;
WAVEFORMATEX* m_waveFormat;
WAVEFORMATEXTENSIBLE* m_waveFormat;
UINT32 m_bufferFrameCount;
std::unique_ptr<Logger> logger;
static const int SAMPLE_RATE = 16000;
static const int CHANNELS = 1;
static const int BITS_PER_SAMPLE = 16;
Expand Down