Files
obs-studio/plugins/frontend-tools/captions-mssapi.cpp
PatTheMav 1829492e6b cmake: Move frontend plugins into main plugins dir
Frontend plugins should not require being placed in the frontend
directory to be built successfully. Indeed they should only depend
on libobs and the obs-frontend-api and thus their source tree should
be able to exist anywhere (even standalone) and the plugin should still
compile successfully (just like any 3rd party plugin).

Thus moving those plugins into the main plugin directory ensures that
they don't require on any "special sauce" within the source tree to
compile.
2026-06-05 16:00:18 -04:00

168 lines
4.2 KiB
C++

#include "captions-mssapi.hpp"
#define do_log(type, format, ...) blog(type, "[Captions] " format, ##__VA_ARGS__)
#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)
mssapi_captions::mssapi_captions(captions_cb callback, const std::string &lang)
try : captions_handler(callback, AUDIO_FORMAT_16BIT, 16000) {
HRESULT hr;
std::wstring wlang;
wlang.resize(lang.size());
for (size_t i = 0; i < lang.size(); i++)
wlang[i] = (wchar_t)lang[i];
LCID lang_id = LocaleNameToLCID(wlang.c_str(), 0);
wchar_t lang_str[32];
_snwprintf(lang_str, 31, L"language=%x", (int)lang_id);
stop = CreateEvent(nullptr, false, false, nullptr);
if (!stop.Valid())
throw "Failed to create event";
hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);
if (FAILED(hr))
throw HRError("SpFindBestToken failed", hr);
hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL, __uuidof(ISpRecognizer),
(void **)&recognizer);
if (FAILED(hr))
throw HRError("CoCreateInstance for recognizer failed", hr);
hr = recognizer->SetRecognizer(token);
if (FAILED(hr))
throw HRError("SetRecognizer failed", hr);
hr = recognizer->SetRecoState(SPRST_INACTIVE);
if (FAILED(hr))
throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);
hr = recognizer->CreateRecoContext(&context);
if (FAILED(hr))
throw HRError("CreateRecoContext failed", hr);
ULONGLONG interest = SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM);
hr = context->SetInterest(interest, interest);
if (FAILED(hr))
throw HRError("SetInterest failed", hr);
hr = context->SetNotifyWin32Event();
if (FAILED(hr))
throw HRError("SetNotifyWin32Event", hr);
notify = context->GetNotifyEventHandle();
if (notify == INVALID_HANDLE_VALUE)
throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);
size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
audio = new CaptionStream((DWORD)sample_rate, this);
audio->Release();
hr = recognizer->SetInput(audio, false);
if (FAILED(hr))
throw HRError("SetInput failed", hr);
hr = context->CreateGrammar(1, &grammar);
if (FAILED(hr))
throw HRError("CreateGrammar failed", hr);
hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
if (FAILED(hr))
throw HRError("LoadDictation failed", hr);
try {
t = std::thread([this]() { main_thread(); });
} catch (...) {
throw "Failed to create thread";
}
} catch (const char *err) {
blog(LOG_WARNING, "%s: %s", __FUNCTION__, err);
throw CAPTIONS_ERROR_GENERIC_FAIL;
} catch (HRError err) {
blog(LOG_WARNING, "%s: %s (%lX)", __FUNCTION__, err.str, err.hr);
throw CAPTIONS_ERROR_GENERIC_FAIL;
}
mssapi_captions::~mssapi_captions()
{
if (t.joinable()) {
SetEvent(stop);
t.join();
}
}
void mssapi_captions::main_thread()
try {
HRESULT hr;
os_set_thread_name(__FUNCTION__);
hr = grammar->SetDictationState(SPRS_ACTIVE);
if (FAILED(hr))
throw HRError("SetDictationState failed", hr);
hr = recognizer->SetRecoState(SPRST_ACTIVE);
if (FAILED(hr))
throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);
HANDLE events[] = {notify, stop};
started = true;
for (;;) {
DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
if (ret != WAIT_OBJECT_0)
break;
CSpEvent event;
bool exit = false;
while (event.GetFrom(context) == S_OK) {
if (event.eEventId == SPEI_RECOGNITION) {
ISpRecoResult *result = event.RecoResult();
CoTaskMemPtr<wchar_t> text;
hr = result->GetText((ULONG)-1, (ULONG)-1, true, &text, nullptr);
if (FAILED(hr))
continue;
char text_utf8[512];
os_wcs_to_utf8(text, 0, text_utf8, 512);
callback(text_utf8);
blog(LOG_DEBUG, "\"%s\"", text_utf8);
} else if (event.eEventId == SPEI_END_SR_STREAM) {
exit = true;
break;
}
}
if (exit)
break;
}
audio->Stop();
} catch (HRError err) {
blog(LOG_WARNING, "%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
}
void mssapi_captions::pcm_data(const void *data, size_t frames)
{
if (started)
audio->PushAudio(data, frames);
}
captions_handler_info mssapi_info = {[]() -> std::string { return "Microsoft Speech-to-Text"; },
[](captions_cb cb, const std::string &lang) -> captions_handler * {
return new mssapi_captions(cb, lang);
}};