用 Visual C++ 创建 Win32 工程,调用 Windows API 做语音识别。查找了很多资料,但很少有用 Win32 实现的,国外倒是有很多人用 C# 调用 Windows API 做语音识别。其中不少例子把语音识别与 Word 结合,把识别出的语音写到 Word 文档里;由于我不是 C# 爱好者,所以没有去查找实现这些功能的 source code。
代码如下:
// SpeechToTextTest2.cpp : Defines the entry point for the application.
#include "stdafx.h"
#include "SpeechToTextTest2.h"
#include<Windows.h>
#include<WinUser.h>
#include <sphelper.h>
#include <string>
//#include "Resource.h"
// Private window message delivered to the dialog when recognition events are
// queued (registered via SetNotifyWindowMessage in LaunchRecognition).
// Parenthesised so the macro expands safely inside larger expressions.
#define WM_RECOEVENT (WM_USER + 1)
// Control identifier; not referenced by the code in this listing.
#define ID_START_RECOG 13

// Dialog procedure of the main dialog.
BOOL CALLBACK DlgProc(HWND hWnd, UINT Message, WPARAM wParam, LPARAM lParam);
// Creates the SAPI objects and activates dictation; throws std::string on failure.
void LaunchRecognition(HWND hWnd);
// Drains the recognition event queue and updates the dialog.
void HandleEvent(HWND hWnd);
// Returns the text carried by a recognition event.
WCHAR *ExtractInput(CSpEvent event);
// Releases the SAPI objects created by LaunchRecognition.
void CleanupSAPI();

// SAPI objects shared by the functions above; populated by LaunchRecognition
// and released by CleanupSAPI.
CComPtr<ISpRecognizer> g_cpEngine;
CComPtr<ISpRecoContext> g_cpRecoCtx;
CComPtr<ISpRecoGrammar> g_cpRecoGrammar;

// Text accumulated during recognition and shown in the edit control.
// Allocated (MAX_PATH characters) and freed in WinMain.
WCHAR *lpszBuffer;
// Application entry point.
// Allocates the global text buffer, runs the main dialog modally with DlgProc
// as its dialog procedure, then frees the buffer. Always returns 0.
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd)
{
    // Storage for the text collected while speech recognition is running;
    // it starts out as an empty string.
    lpszBuffer = new WCHAR[MAX_PATH];
    *lpszBuffer = 0;

    // Returns only once the dialog has been closed with EndDialog.
    DialogBox(hInstance, MAKEINTRESOURCE(IDD_DIALOG12), NULL, DlgProc);

    // The dialog is gone, so the buffer is no longer needed.
    delete[] lpszBuffer;

    return 0;
}
// Dialog procedure of the main dialog.
//
//   WM_RECOEVENT            - recognition events are pending; process them.
//   WM_COMMAND/IDC_BUTTON1  - start speech recognition.
//   WM_CLOSE                - release the SAPI objects and end the dialog.
//
// Returns TRUE for the messages handled above and FALSE for everything else.
//
// NOTE(review): DialogBox expects a DLGPROC, which the Windows headers declare
// as returning INT_PTR; the BOOL return type used here only matches on 32-bit
// builds - confirm the target platform.
BOOL CALLBACK DlgProc(HWND hWnd, UINT Message, WPARAM wParam, LPARAM lParam)
{
    switch (Message)
    {
    case WM_RECOEVENT:
        HandleEvent(hWnd);
        break;

    case WM_COMMAND:
        switch (LOWORD(wParam))
        {
        case IDC_BUTTON1:
            // LaunchRecognition reports failures by throwing std::string.
            // An exception must not unwind through the window procedure back
            // into the system, so report it here instead.
            try
            {
                LaunchRecognition(hWnd);
            }
            catch (const std::string &strError)
            {
                MessageBoxA(hWnd, strError.c_str(), "Speech recognition", MB_OK | MB_ICONERROR);
            }
            break;
        }
        break;

    case WM_CLOSE:
        CleanupSAPI();
        EndDialog(hWnd, 0);
        break;

    default:
        return FALSE;
    }

    return TRUE;
}
void LaunchRecognition(HWND hWnd)
if (FAILED(::CoInitialize(NULL)))
throw std::string("Unable to initialise COM objects");
ULONGLONG ullGramId = 1;
HRESULT hr = g_cpEngine.CoCreateInstance(CLSID_SpSharedRecognizer);
if (FAILED(hr))
throw std::string("Unable to create recognition engine");
hr = g_cpEngine->CreateRecoContext(&g_cpRecoCtx);
if (FAILED(hr))
throw std::string("Failed command recognition");
hr = g_cpRecoCtx->SetNotifyWindowMessage(hWnd, WM_RECOEVENT, 0, 0);
if (FAILED(hr))
throw std::string("Unable to select notification window");
const ULONGLONG ullInterest = SPFEI(SPEI_SOUND_START) | SPFEI(SPEI_SOUND_END) |
SPFEI(SPEI_PHRASE_START) | SPFEI(SPEI_RECOGNITION) |
SPFEI(SPEI_FALSE_RECOGNITION) | SPFEI(SPEI_HYPOTHESIS) |
SPFEI(SPEI_INTERFERENCE) | SPFEI(SPEI_RECO_OTHER_CONTEXT) |
SPFEI(SPEI_REQUEST_UI) | SPFEI(SPEI_RECO_STATE_CHANGE) |
SPFEI(SPEI_PROPERTY_NUM_CHANGE) | SPFEI(SPEI_PROPERTY_STRING_CHANGE);
hr = g_cpRecoCtx->SetInterest(ullInterest, ullInterest);
if (FAILED(hr))
throw std::string("Failed to create interest");
hr = g_cpRecoCtx->CreateGrammar(ullGramId, &g_cpRecoGrammar);
if (FAILED(hr))
throw std::string("Unable to create grammar");
hr = g_cpRecoGrammar->LoadDictation(0, SPLO_STATIC);
if (FAILED(hr))
throw std::string("Failed to load dictation");
hr = g_cpRecoGrammar->SetDictationState(SPRS_ACTIVE);
if (FAILED(hr))
throw std::string("Failed setting dictation state");
// Processes every recognition event currently queued on g_cpRecoCtx.
//
// For each SPEI_HYPOTHESIS event the hypothesised text is shown in a message
// box, appended (followed by "\r\n") to lpszBuffer, and the buffer is written
// to the IDC_EDIT1 control of hWnd. All other events are ignored.
//
// lpszBuffer holds MAX_PATH characters (see WinMain). When the next line does
// not fit, the text collected so far is discarded and the line itself is
// truncated if necessary, so the buffer is never overrun.
void HandleEvent(HWND hWnd)
{
    const size_t cchCapacity = MAX_PATH; // characters, terminator included
    const size_t cchLineBreak = 2;       // length of "\r\n"

    CSpEvent event;

    // Loop processing events while there are any in the queue
    while (event.GetFrom(g_cpRecoCtx) == S_OK)
    {
        switch (event.eEventId)
        {
        case SPEI_HYPOTHESIS:
        {
            WCHAR *pwszText = ExtractInput(event);
            if (pwszText == NULL)
                break; // nothing could be extracted for this event

            // Modal: blocks until the user dismisses the box.
            MessageBoxW(NULL, pwszText, L"text", MB_ICONERROR);

            size_t cchUsed = wcslen(lpszBuffer);
            if (cchUsed + wcslen(pwszText) + cchLineBreak > cchCapacity - 1)
            {
                // Out of room: start over with an empty buffer.
                lpszBuffer[0] = 0;
                cchUsed = 0;
            }

            // wcsncat appends at most the given number of characters plus the
            // terminator; the limit always leaves room for the line break.
            wcsncat(lpszBuffer, pwszText, cchCapacity - 1 - cchLineBreak - cchUsed);
            wcsncat(lpszBuffer, L"\r\n", cchLineBreak);
            SetDlgItemTextW(hWnd, IDC_EDIT1, lpszBuffer);

            // For hypothesis events the string comes from ISpRecoResult::GetText,
            // which allocates it with CoTaskMemAlloc; the caller must free it.
            ::CoTaskMemFree(pwszText);
            break;
        }
        }
    }
}
// Returns the text associated with a recognition event.
//
// event   - event whose recognition result should be read.
// Returns - for SPEI_FALSE_RECOGNITION the string "False recognition",
//           otherwise the whole recognised phrase including replacements.
//           The string is always allocated with CoTaskMemAlloc and must be
//           released by the caller with CoTaskMemFree. NULL is returned when
//           the event carries no result or when a SAPI call or the allocation
//           fails.
//
// NOTE(review): the event is taken by value - confirm that copying a CSpEvent
// does not make its destructor release the recognition result a second time.
WCHAR *ExtractInput(CSpEvent event)
{
    static const WCHAR wszFalseRecognition[] = L"False recognition";

    // Initialised so that every failure path returns/frees a well-defined value.
    WCHAR *pwszText = NULL;
    SPPHRASE *pPhrase = NULL;

    CComPtr<ISpRecoResult> cpRecoResult = event.RecoResult();
    if (!cpRecoResult)
        return NULL;

    HRESULT hr = cpRecoResult->GetPhrase(&pPhrase);
    if (SUCCEEDED(hr))
    {
        if (event.eEventId == SPEI_FALSE_RECOGNITION)
        {
            // Hand out a CoTaskMem copy rather than the literal itself so the
            // caller can free every result the same way.
            pwszText = static_cast<WCHAR *>(::CoTaskMemAlloc(sizeof(wszFalseRecognition)));
            if (pwszText != NULL)
                CopyMemory(pwszText, wszFalseRecognition, sizeof(wszFalseRecognition));
        }
        else
        {
            // Get the phrase's entire text string, including replacements.
            hr = cpRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszText, NULL);
            if (FAILED(hr))
                pwszText = NULL;
        }
    }

    // The phrase was only needed to check that the result is readable.
    ::CoTaskMemFree(pPhrase);
    return pwszText;
}
void CleanupSAPI()
if (g_cpRecoGrammar)
g_cpRecoGrammar.Release();
if (g_cpRecoCtx)
g_cpRecoCtx->SetNotifySink(NULL);
g_cpRecoCtx.Release();
if (g_cpEngine)
g_cpEngine.Release();
CoUninitialize();
运行结果如下:
C++ Speech Recognition:https://www.codeproject.com/Tips/784140/Cplusplus-Speech-Recognition
点进去看这篇文章后,你会发现有一堆问头文件问题的人,无法编译通过,因为源工程已经不存在了,只有.cpp而没有头文件部分。经过一番摸索,我实现了如上代码。
博客里面也只有.cpp部分,由于csdn下载文件要积分,所以就不上传源工程了。
如果觉得有参考价值、需要源文件的同志,可以email我,邮箱:761577651@qq.com
也可以在下方留言:留言比较快!向那些迟发的邮件致歉!
不缺积分的可以点击链接下载:https://download.csdn.net/download/thecentry/10859369
还要提醒的是,我只是实现了,源博客APP的实现问题。
visual c++ 创建Win32工程调用windows API 做语音识别。查找了很多资料,但是很少用Win32的,国外倒是有很多人用C#调用Windows API 做语音识别。很多结合语音识别与word结合,把识别的语音写在word文档上,由于我不是C#喜好者,所以没有查找实现那些的source code。代码如下:// SpeechToTextTest2.cpp : Define...
语音特征
MFCC:Mel Frequency Cepstral Coefficient,目前最主流的语音信号特征提取方式,相比cepstrum的流程,主要是增加了mel滤波,另外用DCT替换了IFFT。
Fbank:亦称MFSC(log mel-frequency spectral Coefficients),特征的提取方法就是相当于MFCC去掉最后一步的离散余弦变换,跟MFCC特征相比,Fbank特征保留了更多的原始语音数据。
声学模型: