Kawe Mazidjatari b3a68ed095 Add EABase, EAThread and DirtySDK to R5sdk
DirtySDK (EA's Dirty Sockets library) will be used for the LiveAPI implementation, and depends on: EABase, EAThread.
2024-04-05 18:29:03 +02:00

1170 lines
44 KiB
C

/*H********************************************************************************/
/*!
\File voipnarrate.c
\Description
Voip narration API wrapping Cloud-based text-to-speech services, supporting
IBM Watson, Microsoft Speech Service, Google Speech, and Amazon Polly.
Narration requests may be up to 255 characters in length, and overlapping
requests are queued in order.
\Notes
References
IBM Watson:
Text-to-Speech API: https://www.ibm.com/watson/developercloud/text-to-speech/api/v1/curl.html
Microsoft Speech Service:
Text-to-Speech How-To: https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/how-to-text-to-speech
Google Text-to-Speech
Text-to-Speech API: https://cloud.google.com/text-to-speech/docs/reference/rest/
Amazon Polly
SynthesizeSpeech API: https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html
Amazon Endpoint Names: https://docs.aws.amazon.com/general/latest/gr/rande.html
VoiceId List: https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html#polly-SynthesizeSpeech-request-VoiceId
WAV:
WAVE file format: https://en.wikipedia.org/wiki/WAV#RIFF_WAVE
\Copyright
Copyright 2018 Electronic Arts
\Version 10/25/2018 (jbrookes) First Version
*/
/********************************************************************************H*/
/*** Include files ****************************************************************/
#include <string.h>
#include "DirtySDK/platform.h"
#include "DirtySDK/dirtysock.h"
#include "DirtySDK/dirtysock/dirtymem.h"
#include "DirtySDK/proto/protostream.h"
#include "DirtySDK/util/aws.h"
#include "DirtySDK/util/base64.h"
#include "DirtySDK/util/jsonformat.h"
#include "DirtySDK/util/jsonparse.h"
#include "DirtySDK/voip/voipdef.h"
#include "DirtySDK/voip/voipnarrate.h"
/*** Defines **********************************************************************/
//! protostream minimum data amount (for base64 decoding; four is the minimum amount but that produces one and a half samples, so we choose eight, BUT...
#define VOIPNARRATE_MINBUF (8)
//! how many ms of audio received should we treat as being empty audio (for metrics)
#define VOIPNARRATE_EMPTY_AUDIO_THRESHOLD_MS (300)
/*** Macros ***********************************************************************/
/*** Type Definitions *************************************************************/
//! provider configuration; one global instance holds the values set via VoipNarrateConfig(), and each module ref carries its own copy
typedef struct VoipNarrateConfigT
{
VoipNarrateProviderE eProvider; //!< configured provider
char strUrl[256]; //!< URL for text-to-speech request
char strKey[128]; //!< API key required for service authentication
} VoipNarrateConfigT;
//! narration request data (singly-linked list node; new requests are inserted at the list head)
typedef struct VoipNarrateRequestT
{
struct VoipNarrateRequestT *pNext; //!< next (older) request in list, or NULL
VoipNarrateGenderE eGender; //!< preferred gender for voice narration
char strText[VOIPNARRATE_INPUT_MAX]; //!< text to be converted (truncated to fit on copy)
int8_t iUserIndex; //!< local user index of user who requested the narration
} VoipNarrateRequestT;
//! module state
struct VoipNarrateRefT
{
int32_t iMemGroup; //!< mem group id, as queried at module creation; used for all module allocations
void *pMemGroupUserData; //!< mem group user data, as queried at module creation
ProtoStreamRefT *pProtoStream; //!< stream transport module to handle buffered download of audio data
VoipNarrateVoiceDataCbT *pVoiceDataCb; //!< user callback used to provide voice data
void *pUserData; //!< user data for user callback
VoipNarrateRequestT *pRequest; //!< list of queued requests, if any
VoipNarrateConfigT Config; //!< module configuration (provider and credentials)
char strHead[256]; //!< http head for narration request
char strBody[512]; //!< http body for narration request
const char *pBody; //!< pointer to start of body (may not match buffer start)
VoipTextToSpeechMetricsT Metrics; //!< Usage metrics of the narration module
uint32_t uTtsStartTime; //!< time when we sent the request
uint8_t aVoiceBuf[160*3*2]; //!< base64 decode buffer, sized for one 30ms frame of 16khz 16bit voice audio, also a multiple of three bytes to accommodate base64 4->3 ratio
int32_t iVoiceOff; //!< read offset in buffered voice data
int32_t iVoiceLen; //!< end of buffered voice data
int32_t iSamplesInPhrase; //!< total number of samples received for this phrase; NOTE(review): the stream callbacks add decoded/consumed sizes that look like byte counts - confirm units
uint8_t bStart; //!< TRUE if start of stream download, else FALSE
uint8_t bActive; //!< TRUE if stream is active, else FALSE
int8_t iUserIndex; //!< index of local user current request is being made for
int8_t iVerbose; //!< verbose debug level (debug only)
};
/*** Variables ********************************************************************/
//! global config state; written by VoipNarrateConfig() and copied into each new module ref by VoipNarrateCreate()
static VoipNarrateConfigT _VoipNarrate_Config = { VOIPNARRATE_PROVIDER_NONE, "", "" };
/*** Private Functions ************************************************************/
/*F********************************************************************************/
/*!
    \Function _VoipNarrateCustomHeaderCb

    \Description
        Custom http header callback.  When the configured provider is Amazon the
        request header is passed to AWSSignSigV4() for signing; for every other
        provider the header is left untouched.

    \Input *pState      - http module state (unused)
    \Input *pHeader     - [in/out] pointer to null-terminated http header buffer
    \Input uHeaderSize  - size of http header buffer
    \Input *pData       - pointer to data (unused)
    \Input iDataLen     - data length (unused)
    \Input *pUserRef    - voipnarrate ref

    \Output
        int32_t         - header length, including whatever AWSSignSigV4() reported adding

    \Version 12/28/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateCustomHeaderCb(ProtoHttpRefT *pState, char *pHeader, uint32_t uHeaderSize, const char *pData, int64_t iDataLen, void *pUserRef)
{
    VoipNarrateRefT *pRef = (VoipNarrateRefT *)pUserRef;
    int32_t iLength = (int32_t)strlen(pHeader);

    // only amazon requests are signed, and only when the current header fits the buffer
    if ((pRef->Config.eProvider == VOIPNARRATE_PROVIDER_AMAZON) && (uHeaderSize >= (unsigned)iLength))
    {
        // sign the request against the formatted body, using the configured key for the "polly" service
        iLength += AWSSignSigV4(pHeader, uHeaderSize, pRef->pBody, pRef->Config.strKey, "polly", NULL);
    }

    // report (possibly updated) header size to protohttp
    return(iLength);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateSkipWavHeader

    \Description
        Return offset past WAV header in input data.  The RIFF/WAVE signature is
        validated, then chunks are walked until the "data" chunk is found.  Only
        chunk headers that lie entirely within the supplied data are inspected.

    \Input *pData       - pointer to wav header
    \Input iDataLen     - length of data

    \Output
        int32_t         - offset of first byte following the "data" chunk header, or
                          zero if the input is not RIFF/WAVE or the data chunk header
                          is not fully contained in the input

    \Version 11/06/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateSkipWavHeader(const uint8_t *pData, int32_t iDataLen)
{
    int32_t iOffset;
    uint32_t uChkLen;

    // validate RIFF/WAVE header (12 bytes: "RIFF", 4-byte size, "WAVE")
    if ((iDataLen < 12) || ds_strnicmp((const char *)pData, "RIFF", 4) || ds_strnicmp((const char *)pData+8, "WAVE", 4))
    {
        return(0);
    }

    // process chunks; every chunk starts with an 8-byte header (4-byte id, 4-byte length) that must be within the data
    for (iOffset = 12; (iDataLen - iOffset) >= 8; iOffset += (int32_t)uChkLen + 8)
    {
        // look for data chunk; audio starts immediately after its header
        if (!ds_strnicmp((const char *)pData+iOffset, "data", 4))
        {
            return(iOffset+8);
        }

        // get little-endian chunk length (assembled unsigned so the top byte never shifts into the sign bit)
        uChkLen = (uint32_t)pData[iOffset+4];
        uChkLen |= (uint32_t)pData[iOffset+5] << 8;
        uChkLen |= (uint32_t)pData[iOffset+6] << 16;
        uChkLen |= (uint32_t)pData[iOffset+7] << 24;

        // a chunk running past the supplied data means the data chunk is not in this buffer
        if (uChkLen > (uint32_t)(iDataLen - iOffset - 8))
        {
            break;
        }
    }

    // data chunk not found
    return(0);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateBasicAuth

    \Description
        Encode Basic HTTP authorization credentials as per
        https://tools.ietf.org/html/rfc7617: "user:pass" is formatted into a local
        128-byte buffer and then Base64-encoded into the caller's buffer.

    \Input *pBuffer     - [out] output buffer for encoded base64 string
    \Input iBufSize     - size of output buffer
    \Input *pUser       - user identifier
    \Input *pPass       - user password

    \Output
        const char *    - pointer to output buffer

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static const char *_VoipNarrateBasicAuth(char *pBuffer, int32_t iBufSize, const char *pUser, const char *pPass)
{
    char strCredentials[128];
    int32_t iCredLen;

    // build user:pass credential string
    ds_snzprintf(strCredentials, sizeof(strCredentials), "%s:%s", pUser, pPass);
    iCredLen = (int32_t)strlen(strCredentials);

    // base64 encode credentials into caller's buffer
    Base64Encode2(strCredentials, iCredLen, pBuffer, iBufSize);
    return(pBuffer);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateBase64Decode

    \Description
        Decode Base64-encoded voice data.  If the input contains the start of the
        json envelope ("audioContent": "...) it is skipped, and if it contains the
        closing quote the input is trimmed there.  All searches are confined to the
        iInpSize bytes supplied; the input is not assumed to be null-terminated.

    \Input *pVoipNarrate    - pointer to module state (unused)
    \Input *pOutput         - [out] buffer to hold decoded voice data
    \Input *pOutSize        - [in/out] output buffer length, size of output data
    \Input *pInput          - base64-encoded input data
    \Input iInpSize         - input buffer length

    \Output
        int32_t             - negative=failure (envelope key found without an opening
                              quote), else input bytes consumed (never more than iInpSize)

    \Version 10/27/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateBase64Decode(VoipNarrateRefT *pVoipNarrate, char *pOutput, int32_t *pOutSize, const char *pInput, int32_t iInpSize)
{
    static const char _strJson[] = "\"audioContent\":";
    const int32_t iJsonLen = (int32_t)sizeof(_strJson) - 1; // key length, not counting the terminating null
    const char *pInpEnd, *pScan, *pQuote;
    int32_t iInpOff = 0, iMaxInp;

    // nothing to decode
    if (iInpSize <= 0)
    {
        *pOutSize = 0;
        return(0);
    }
    pInpEnd = pInput + iInpSize;

    // if we have the beginning of json envelope, skip it
    for (pScan = pInput; ((pInpEnd - pScan) >= iJsonLen) && (memcmp(pScan, _strJson, (size_t)iJsonLen) != 0); pScan += 1)
        ;
    if ((pInpEnd - pScan) >= iJsonLen)
    {
        // skip json key
        pScan += iJsonLen;
        // skip to opening quote of base64 data
        if ((pQuote = memchr(pScan, '"', (size_t)(pInpEnd - pScan))) == NULL)
        {
            return(-1);
        }
        // skip quote
        pScan = pQuote + 1;
        // remember to consume the envelope in addition to base64 data, and shrink the remaining input to match
        iInpOff = (int32_t)(pScan - pInput);
        iInpSize -= iInpOff;
        pInput = pScan;
    }

    // if we have end of json envelope, trim it
    if ((pQuote = memchr(pInput, '"', (size_t)iInpSize)) != NULL)
    {
        // handle end of data; nothing but the envelope tail remains, so consume all of it
        if (pQuote == pInput)
        {
            iInpOff += iInpSize;
        }
        iInpSize = (int32_t)(pQuote - pInput);
    }

    // constrain input size to what will fit in output buffer
    iMaxInp = Base64EncodedSize(*pOutSize);
    if (iInpSize > iMaxInp)
    {
        iInpSize = iMaxInp;
    }
    // make sure input size is a multiple of four to produce an integral number of output bytes
    iInpSize &= ~0x03;

    // base64 decode and save output size
    *pOutSize = Base64Decode3(pInput, iInpSize, pOutput, *pOutSize);

    // return number of bytes of input consumed
    return(iInpSize+iInpOff);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateRequestAdd

    \Description
        Allocate a request, fill it in from the arguments, and link it in at the
        head of the module's request list for later sending.

    \Input *pVoipNarrate    - pointer to module state
    \Input iUserIndex       - local user index of user who is requesting speech synthesis
    \Input eGender          - preferred gender for voice narration
    \Input *pText           - text to be converted

    \Output
        int32_t             - negative=failure (allocation failed), else success

    \Version 11/09/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateRequestAdd(VoipNarrateRefT *pVoipNarrate, int32_t iUserIndex, VoipNarrateGenderE eGender, const char *pText)
{
    VoipNarrateRequestT *pQueued = DirtyMemAlloc(sizeof(*pQueued), VOIPNARRATE_MEMID, pVoipNarrate->iMemGroup, pVoipNarrate->pMemGroupUserData);

    // bail if we could not get memory for the request; this counts as an error in the metrics
    if (pQueued == NULL)
    {
        NetPrintf(("voipnarrate: could not allocate request\n"));
        pVoipNarrate->Metrics.uErrorCount += 1;
        return(-1);
    }

    // start from a cleared request and fill in the caller's data
    ds_memclr(pQueued, sizeof(*pQueued));
    pQueued->eGender = eGender;
    pQueued->iUserIndex = iUserIndex;
    ds_strnzcpy(pQueued->strText, pText, sizeof(pQueued->strText));

    // link in at head of list
    pQueued->pNext = pVoipNarrate->pRequest;
    pVoipNarrate->pRequest = pQueued;

    return(0);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateRequestGet

    \Description
        Remove the oldest queued request (the list tail, since requests are added
        at the head), optionally copying it out to the caller before it is freed.
        If the queue is empty the list is left untouched and the output request,
        if supplied, is cleared.

    \Input *pVoipNarrate    - pointer to module state
    \Input *pRequest        - [out] storage for request (may be null)

    \Version 11/09/2018 (jbrookes)
*/
/********************************************************************************F*/
static void _VoipNarrateRequestGet(VoipNarrateRefT *pVoipNarrate, VoipNarrateRequestT *pRequest)
{
    VoipNarrateRequestT **ppRequest;

    // nothing queued; clear the output rather than dereferencing a null list head
    if (pVoipNarrate->pRequest == NULL)
    {
        if (pRequest != NULL)
        {
            ds_memclr(pRequest, sizeof(*pRequest));
        }
        return;
    }

    // get oldest request (we add to head, so get from tail)
    for (ppRequest = &pVoipNarrate->pRequest; (*ppRequest)->pNext != NULL; ppRequest = &((*ppRequest)->pNext))
        ;
    // copy request
    if (pRequest != NULL)
    {
        ds_memcpy_s(pRequest, sizeof(*pRequest), *ppRequest, sizeof(**ppRequest));
    }
    // free request
    DirtyMemFree(*ppRequest, VOIPNARRATE_MEMID, pVoipNarrate->iMemGroup, pVoipNarrate->pMemGroupUserData);
    // remove from list (clears the link that pointed at the freed tail)
    *ppRequest = NULL;
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatHeadWatson

    \Description
        Format request header and url for IBM Watson Speech service.  The header
        carries content type, accepted audio format, and a Basic authorization
        built from "apikey" and the configured key; the url is the configured url
        with a voice query parameter chosen by gender.
        Ref: https://console.bluemix.net/docs/services/text-to-speech/http.html#usingHTTP

    \Input *pVoipNarrate    - pointer to module state
    \Input *pUrl            - [out] buffer for formatted url
    \Input iUrlLen          - length of url buffer
    \Input *pHead           - [out] buffer for formatted request header
    \Input iHeadLen         - length of header buffer
    \Input eGender          - preferred gender for voice narration

    \Output
        const char *        - pointer to formatted url (pUrl)

    \Version 11/07/2018 (jbrookes)
*/
/********************************************************************************F*/
static const char *_VoipNarrateFormatHeadWatson(VoipNarrateRefT *pVoipNarrate, char *pUrl, int32_t iUrlLen, char *pHead, int32_t iHeadLen, VoipNarrateGenderE eGender)
{
    char strEncoded[128];
    char *pWrite = pHead;
    int32_t iRemain = iHeadLen, iWrote;
    const char *pVoice;

    // encode Basic authorization string with string apikey:<key>
    _VoipNarrateBasicAuth(strEncoded, sizeof(strEncoded), "apikey", pVoipNarrate->Config.strKey);

    // format request header, one line at a time, advancing the write position by what was written
    iWrote = ds_snzprintf(pWrite, iRemain, "Content-Type: application/json\r\n");
    pWrite += iWrote;
    iRemain -= iWrote;
    iWrote = ds_snzprintf(pWrite, iRemain, "Accept: audio/wav; rate=%d\r\n", VOIPNARRATE_SAMPLERATE);
    pWrite += iWrote;
    iRemain -= iWrote;
    ds_snzprintf(pWrite, iRemain, "Authorization: Basic %s\r\n", strEncoded);

    // pick voice based on preferred gender; anything other than female gets the male voice
    if (eGender == VOIPNARRATE_GENDER_FEMALE)
    {
        pVoice = "en-US_AllisonVoice";
    }
    else
    {
        pVoice = "en-US_MichaelVoice";
    }

    // format url with voice
    ds_snzprintf(pUrl, iUrlLen, "%s?voice=%s", pVoipNarrate->Config.strUrl, pVoice);
    return(pUrl);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatBodyWatson

    \Description
        Format request body for IBM Watson Speech service: a json document with a
        single "text" member holding the text to be spoken.

    \Input *pVoipNarrate    - pointer to module state (unused)
    \Input *pBody           - [out] buffer to hold request body
    \Input iBodyLen         - buffer length
    \Input *pText           - pointer to text request

    \Output
        char *              - pointer to formatted body, as returned by JsonFinish()

    \Version 11/07/2018 (jbrookes)
*/
/********************************************************************************F*/
static char *_VoipNarrateFormatBodyWatson(VoipNarrateRefT *pVoipNarrate, char *pBody, int32_t iBodyLen, const char *pText)
{
    // start json document in caller's buffer
    JsonInit(pBody, iBodyLen, JSON_FL_WHITESPACE);

    // the text to synthesize is the only member
    JsonAddStr(pBody, "text", pText);

    // finish document and hand back the body pointer
    return(JsonFinish(pBody));
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatHeadMicrosoft

    \Description
        Format request header and url for Microsoft Speech service.  The header
        carries the ssml content type, the raw pcm output format derived from
        VOIPNARRATE_SAMPLERATE, and the configured subscription key; the url is
        the configured url, copied unchanged.

    \Input *pVoipNarrate    - pointer to module state
    \Input *pUrl            - [out] buffer for formatted url
    \Input iUrlLen          - length of url buffer
    \Input *pHead           - [out] buffer for formatted request header
    \Input iHeadLen         - length of header buffer

    \Output
        const char *        - pointer to formatted url (pUrl)

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static const char *_VoipNarrateFormatHeadMicrosoft(VoipNarrateRefT *pVoipNarrate, char *pUrl, int32_t iUrlLen, char *pHead, int32_t iHeadLen)
{
    char *pWrite = pHead;
    int32_t iRemain = iHeadLen, iWrote;

    // format request header, one line at a time, advancing the write position by what was written
    iWrote = ds_snzprintf(pWrite, iRemain, "Content-Type: application/ssml+xml\r\n");
    pWrite += iWrote;
    iRemain -= iWrote;
    iWrote = ds_snzprintf(pWrite, iRemain, "X-Microsoft-OutputFormat: raw-%dkhz-16bit-mono-pcm\r\n", VOIPNARRATE_SAMPLERATE/1000);
    pWrite += iWrote;
    iRemain -= iWrote;
    ds_snzprintf(pWrite, iRemain, "Ocp-Apim-Subscription-Key: %s\r\n", pVoipNarrate->Config.strKey);

    // url is used as configured
    ds_strnzcpy(pUrl, pVoipNarrate->Config.strUrl, iUrlLen);
    return(pUrl);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatBodyMicrosoft

    \Description
        Format request body for Microsoft Speech service: an SSML document with a
        single voice element wrapping the text.  The xml markup characters
        '&', '<' and '>' in the text are escaped so arbitrary user text cannot
        produce a malformed document, and if the buffer is too small the text is
        truncated so that the closing tags are still written.

    \Input *pVoipNarrate    - pointer to module state (unused)
    \Input *pBody           - [out] buffer to hold request body
    \Input iBodyLen         - buffer length
    \Input eGender          - preferred gender for voice narration
    \Input *pText           - pointer to text request

    \Output
        char *              - pointer to formatted body (pBody)

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static char *_VoipNarrateFormatBodyMicrosoft(VoipNarrateRefT *pVoipNarrate, char *pBody, int32_t iBodyLen, VoipNarrateGenderE eGender, const char *pText)
{
    static const char _strClose[] = "</voice></speak>";
    const int32_t iCloseLen = (int32_t)sizeof(_strClose) - 1;
    int32_t iOffset=0;

    // format request body up to the start of the text
    iOffset += ds_snzprintf(pBody+iOffset, iBodyLen-iOffset, "<speak version='1.0' xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang='en-US'>");
    iOffset += ds_snzprintf(pBody+iOffset, iBodyLen-iOffset, "<voice name='Microsoft Server Speech Text to Speech Voice (en-US, %s)'>",
        (eGender == VOIPNARRATE_GENDER_FEMALE) ? "JessaRUS" : "BenjaminRUS");

    // append text, escaping xml markup characters
    for (; *pText != '\0'; pText += 1)
    {
        const char *pEscape;
        int32_t iCharLen;

        switch (*pText)
        {
            case '&': pEscape = "&amp;"; break;
            case '<': pEscape = "&lt;"; break;
            case '>': pEscape = "&gt;"; break;
            default: pEscape = NULL; break;
        }
        iCharLen = (pEscape != NULL) ? (int32_t)strlen(pEscape) : 1;

        // truncate the text if this character, the closing tags, and the terminator will not all fit
        if ((iOffset + iCharLen + iCloseLen + 1) > iBodyLen)
        {
            break;
        }

        if (pEscape != NULL)
        {
            memcpy(pBody+iOffset, pEscape, (size_t)iCharLen);
        }
        else
        {
            pBody[iOffset] = *pText;
        }
        iOffset += iCharLen;
    }

    // close voice and speak elements (this also null-terminates the body)
    ds_snzprintf(pBody+iOffset, iBodyLen-iOffset, "%s", _strClose);

    // return pointer to body
    return(pBody);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatHeadGoogle

    \Description
        Format request header and url for Google Text to Speech.  No extra header
        lines are produced (the header is set to an empty string); the configured
        key is passed as a "key" query parameter on the configured url.

    \Input *pVoipNarrate    - pointer to module state
    \Input *pUrl            - [out] buffer for formatted url
    \Input iUrlLen          - length of url buffer
    \Input *pHead           - [out] buffer for formatted request header
    \Input iHeadLen         - length of header buffer (unused)

    \Output
        const char *        - pointer to formatted url (pUrl)

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static const char *_VoipNarrateFormatHeadGoogle(VoipNarrateRefT *pVoipNarrate, char *pUrl, int32_t iUrlLen, char *pHead, int32_t iHeadLen)
{
    const VoipNarrateConfigT *pConfig = &pVoipNarrate->Config;

    // no additional header lines for this provider
    pHead[0] = '\0';

    // authentication is by key in the url query string
    ds_snzprintf(pUrl, iUrlLen, "%s?key=%s", pConfig->strUrl, pConfig->strKey);

    return(pUrl);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatBodyGoogle

    \Description
        Format request body for Google text-to-speech request: a json document
        with "input" (text), "voice" (language, voice name and gender), and
        "audioConfig" (LINEAR16 at VOIPNARRATE_SAMPLERATE) objects.  A gender value
        outside the lookup tables falls back to the first (unspecified) entry.

    \Input *pVoipNarrate    - pointer to module state (unused)
    \Input *pBody           - [out] buffer to hold request body
    \Input iBodyLen         - buffer length
    \Input eGender          - preferred gender for voice narration
    \Input *pText           - pointer to text request

    \Output
        char *              - pointer to formatted body, as returned by JsonFinish()

    \Notes
        Ref: https://cloud.google.com/text-to-speech/docs/reference/rest/

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static char * _VoipNarrateFormatBodyGoogle(VoipNarrateRefT *pVoipNarrate, char *pBody, int32_t iBodyLen, VoipNarrateGenderE eGender, const char *pText)
{
    static const char *_strGender[VOIPNARRATE_NUMGENDERS] = { "SSML_VOICE_GENDER_UNSPECIFIED", "FEMALE", "MALE", "NEUTRAL" };
    static const char *_strVoice[VOIPNARRATE_NUMGENDERS] = { "en-US-Standard-D", "en-US-Standard-C", "en-US-Standard-B", "en-US-Standard-D" };
    int32_t iGender = (int32_t)eGender;

    // never index the lookup tables with an out-of-range gender
    if ((iGender < 0) || (iGender >= VOIPNARRATE_NUMGENDERS))
    {
        iGender = 0;
    }

    JsonInit(pBody, iBodyLen, JSON_FL_WHITESPACE);

    // text to synthesize
    JsonObjectStart(pBody, "input");
    JsonAddStr(pBody, "text", pText);
    JsonObjectEnd(pBody);

    // voice selection
    JsonObjectStart(pBody, "voice");
    JsonAddStr(pBody, "languageCode", "en-US");
    JsonAddStr(pBody, "name", _strVoice[iGender]);
    JsonAddStr(pBody, "ssmlGender", _strGender[iGender]); // we specify gender here, but it is unclear if it does anything
    JsonObjectEnd(pBody);

    // requested audio format
    JsonObjectStart(pBody, "audioConfig");
    JsonAddStr(pBody, "audioEncoding", "LINEAR16");
    JsonAddInt(pBody, "sampleRateHertz", VOIPNARRATE_SAMPLERATE);
    JsonObjectEnd(pBody);

    pBody = JsonFinish(pBody);
    return(pBody);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatHeadAmazon

    \Description
        Format request header and url for Amazon Polly.  The header carries the
        json content type and accepted audio format; the url is the configured
        url, copied unchanged.  No authorization line is written here.

    \Input *pVoipNarrate    - pointer to module state
    \Input *pUrl            - [out] buffer for formatted url
    \Input iUrlLen          - length of url buffer
    \Input *pHead           - [out] buffer for formatted request header
    \Input iHeadLen         - length of header buffer

    \Output
        const char *        - pointer to formatted url (pUrl)

    \Version 11/21/2018 (jbrookes)
*/
/********************************************************************************F*/
static const char *_VoipNarrateFormatHeadAmazon(VoipNarrateRefT *pVoipNarrate, char *pUrl, int32_t iUrlLen, char *pHead, int32_t iHeadLen)
{
    char *pWrite = pHead;
    int32_t iRemain = iHeadLen, iWrote;

    // format request header, advancing the write position by what was written
    iWrote = ds_snzprintf(pWrite, iRemain, "Content-Type: application/json\r\n");
    pWrite += iWrote;
    iRemain -= iWrote;
    ds_snzprintf(pWrite, iRemain, "Accept: audio/wav; rate=%d\r\n", VOIPNARRATE_SAMPLERATE);

    // url is used as configured
    ds_strnzcpy(pUrl, pVoipNarrate->Config.strUrl, iUrlLen);
    return(pUrl);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateFormatBodyAmazon

    \Description
        Format request body for Amazon Polly: a json document with "OutputFormat"
        (pcm), "Text", and "VoiceId" members.  The voice is "Joanna" for the female
        gender and "Joey" for anything else.

    \Input *pVoipNarrate    - pointer to module state (unused)
    \Input *pBody           - [out] buffer to hold request body
    \Input iBodyLen         - buffer length
    \Input eGender          - preferred gender for voice narration
    \Input *pText           - pointer to text request

    \Output
        char *              - pointer to formatted body, as returned by JsonFinish()

    \Version 12/21/2018 (jbrookes)
*/
/********************************************************************************F*/
static char *_VoipNarrateFormatBodyAmazon(VoipNarrateRefT *pVoipNarrate, char *pBody, int32_t iBodyLen, VoipNarrateGenderE eGender, const char *pText)
{
    const char *pVoiceId;

    // choose voice based on preferred gender
    if (eGender == VOIPNARRATE_GENDER_FEMALE)
    {
        pVoiceId = "Joanna";
    }
    else
    {
        pVoiceId = "Joey";
    }

    // build the json request
    JsonInit(pBody, iBodyLen, JSON_FL_WHITESPACE);
    JsonAddStr(pBody, "OutputFormat", "pcm");
    JsonAddStr(pBody, "Text", pText);
    JsonAddStr(pBody, "VoiceId", pVoiceId);

    return(JsonFinish(pBody));
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateStreamCallbackGoogle

    \Description
        Google-specific stream processing.  Any voice data already decoded into the
        module's voice buffer is offered to the user callback first (skipping a WAV
        header at start of stream); only once that buffer has been fully consumed
        is more Base64 input decoded into it.

    \Input *pVoipNarrate    - pointer to module state
    \Input eStatus          - ProtoStream status (unused)
    \Input *pData           - base64-encoded input data
    \Input iDataSize        - input buffer length

    \Output
        int32_t             - negative=failure, else number of input bytes consumed
                              (zero while previously decoded data is still pending)

    \Version 10/30/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateStreamCallbackGoogle(VoipNarrateRefT *pVoipNarrate, ProtoStreamStatusE eStatus, const uint8_t *pData, int32_t iDataSize)
{
    int32_t iConsumed, iDecodedLen;

    // submit any base64-decoded data we have first
    if (pVoipNarrate->iVoiceLen > 0)
    {
        int32_t iAccepted;

        // at start of stream the decoded data may begin with a WAV header that must not reach the user
        if (pVoipNarrate->bStart)
        {
            pVoipNarrate->iVoiceOff = _VoipNarrateSkipWavHeader(pVoipNarrate->aVoiceBuf, pVoipNarrate->iVoiceLen);
            pVoipNarrate->iVoiceLen -= pVoipNarrate->iVoiceOff;
            pVoipNarrate->bStart = FALSE;
        }

        // pass data to user, and advance past whatever the user accepted
        iAccepted = pVoipNarrate->pVoiceDataCb(pVoipNarrate, pVoipNarrate->iUserIndex, (const int16_t *)(pVoipNarrate->aVoiceBuf+pVoipNarrate->iVoiceOff), pVoipNarrate->iVoiceLen, pVoipNarrate->pUserData);
        pVoipNarrate->iVoiceOff += iAccepted;
        pVoipNarrate->iVoiceLen -= iAccepted;
    }

    // decoded data still pending, or nothing to decode; consume no input this time
    if ((pVoipNarrate->iVoiceLen > 0) || (iDataSize <= 0))
    {
        return(0);
    }

    // voice buffer is empty; decode the next batch from its start
    pVoipNarrate->iVoiceOff = 0;
    iDecodedLen = sizeof(pVoipNarrate->aVoiceBuf);
    iConsumed = _VoipNarrateBase64Decode(pVoipNarrate, (char *)pVoipNarrate->aVoiceBuf, &iDecodedLen, (const char *)pData, iDataSize);

    // on failure, log the offending input and count the error
    if (iConsumed < 0)
    {
        NetPrintf(("voipnarrate: error; could not base64 decode data\n"));
        NetPrintMem(pData, iDataSize, "base64 data");
        pVoipNarrate->Metrics.uErrorCount += 1;
        return(iConsumed);
    }

    // remember how much decoded data is buffered and add it to the phrase total
    pVoipNarrate->iVoiceLen = iDecodedLen;
    pVoipNarrate->iSamplesInPhrase += iDecodedLen;
    return(iConsumed);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateStreamCallback

    \Description
        ProtoStream callback: signals stream start/end to the user callback,
        records delay/duration/error metrics, and feeds received voice data to the
        user callback (directly, or via Base64 decoding for Google).

    \Input *pProtoStream    - ProtoStream module state
    \Input eStatus          - ProtoStream status
    \Input *pData           - input data (base64-encoded for Google, else raw audio)
    \Input iDataSize        - input buffer length
    \Input *pUserData       - callback user data (VoipNarrate module ref)

    \Output
        int32_t             - negative=failure, else number of input bytes consumed

    \Version 10/30/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateStreamCallback(ProtoStreamRefT *pProtoStream, ProtoStreamStatusE eStatus, const uint8_t *pData, int32_t iDataSize, void *pUserData)
{
    VoipNarrateRefT *pVoipNarrate = (VoipNarrateRefT *)pUserData;
    int32_t iDataRead, iResult;
    char strError[256] = "";

    // handle start callback notification
    if (eStatus == PROTOSTREAM_STATUS_BEGIN)
    {
        pVoipNarrate->iSamplesInPhrase = 0;
        pVoipNarrate->Metrics.uDelay += NetTickDiff(NetTick(), pVoipNarrate->uTtsStartTime);
        pVoipNarrate->pVoiceDataCb(pVoipNarrate, pVoipNarrate->iUserIndex, (const int16_t *)pData, VOIPNARRATE_STREAM_START, pVoipNarrate->pUserData);
    }

    // handle end callback notification
    if (eStatus == PROTOSTREAM_STATUS_DONE)
    {
        // save metrics; widen before scaling so a long phrase cannot overflow 32-bit signed arithmetic
        // NOTE(review): iSamplesInPhrase is accumulated from sizes that look like byte counts - confirm the units this duration assumes
        int32_t iPhraseDuration = (int32_t)(((int64_t)pVoipNarrate->iSamplesInPhrase * 1000) / VOIPNARRATE_SAMPLERATE);
        pVoipNarrate->Metrics.uDurationMsRecv += iPhraseDuration;
        if (iPhraseDuration < VOIPNARRATE_EMPTY_AUDIO_THRESHOLD_MS)
        {
            pVoipNarrate->Metrics.uEmptyResultCount += 1;
        }

        // signal end of stream
        pVoipNarrate->pVoiceDataCb(pVoipNarrate, pVoipNarrate->iUserIndex, (const int16_t *)pData, VOIPNARRATE_STREAM_END, pVoipNarrate->pUserData);
        pVoipNarrate->bActive = FALSE;

        // check for a completion result that is not successful, and log error response (if any) to debug output
        if ((iResult = ProtoStreamStatus(pProtoStream, 'code', NULL, 0)) != PROTOHTTP_RESPONSE_SUCCESSFUL)
        {
            ProtoStreamStatus(pProtoStream, 'serr', strError, sizeof(strError));
            NetPrintf(("voipnarrate: stream failed with http result %d:\n%s\n", iResult, strError));
            pVoipNarrate->Metrics.uErrorCount += 1;
        }
    }

    // read data and pass it to callback, processing as necessary; stop when nothing was consumed or no data remains
    for (iDataRead = 0, iResult = 1; (iResult > 0) && (iDataSize > 0); )
    {
        if (pVoipNarrate->Config.eProvider != VOIPNARRATE_PROVIDER_GOOGLE)
        {
            // if start of stream, see if we need to skip WAV header (the loop condition guarantees we have data)
            if (pVoipNarrate->bStart)
            {
                iDataRead = _VoipNarrateSkipWavHeader(pData, iDataSize);
                pData += iDataRead;
                iDataSize -= iDataRead;
                pVoipNarrate->bStart = FALSE;
            }
            iResult = pVoipNarrate->pVoiceDataCb(pVoipNarrate, pVoipNarrate->iUserIndex, (const int16_t *)pData, iDataSize, pVoipNarrate->pUserData);
            pVoipNarrate->iSamplesInPhrase += iResult;
        }
        else
        {
            // google-specific processing to deal with base64 encoded audio
            iResult = _VoipNarrateStreamCallbackGoogle(pVoipNarrate, eStatus, pData, iDataSize);
        }

        // account for what was consumed this pass
        iDataRead += iResult;
        iDataSize -= iResult;
        pData += iResult;
    }

    return(iDataRead);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateStart

    \Description
        Start a narration request: format the url, header and body for the
        configured provider, record request metrics, and open the stream.  The
        requesting user's index is stored in the module state so the voice data
        callbacks for this request report the right user.

    \Input *pVoipNarrate    - pointer to module state
    \Input iUserIndex       - local user index of user who is requesting speech synthesis
    \Input eGender          - preferred gender for voice for narration
    \Input *pText           - pointer to text request

    \Output
        int32_t             - ProtoStream request result, or negative if no provider is configured

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
static int32_t _VoipNarrateStart(VoipNarrateRefT *pVoipNarrate, int32_t iUserIndex, VoipNarrateGenderE eGender, const char *pText)
{
    const char *pUrl, *pReq;
    char strUrl[256];
    int32_t iResult;

    // format header/url and request body
    switch (pVoipNarrate->Config.eProvider)
    {
        case VOIPNARRATE_PROVIDER_MICROSOFT:
            pUrl = _VoipNarrateFormatHeadMicrosoft(pVoipNarrate, strUrl, sizeof(strUrl), pVoipNarrate->strHead, sizeof(pVoipNarrate->strHead));
            pReq = _VoipNarrateFormatBodyMicrosoft(pVoipNarrate, pVoipNarrate->strBody, sizeof(pVoipNarrate->strBody), eGender, pText);
            break;

        case VOIPNARRATE_PROVIDER_GOOGLE:
            pUrl = _VoipNarrateFormatHeadGoogle(pVoipNarrate, strUrl, sizeof(strUrl), pVoipNarrate->strHead, sizeof(pVoipNarrate->strHead));
            pReq = _VoipNarrateFormatBodyGoogle(pVoipNarrate, pVoipNarrate->strBody, sizeof(pVoipNarrate->strBody), eGender, pText);
            break;

        case VOIPNARRATE_PROVIDER_IBMWATSON:
            pUrl = _VoipNarrateFormatHeadWatson(pVoipNarrate, strUrl, sizeof(strUrl), pVoipNarrate->strHead, sizeof(pVoipNarrate->strHead), eGender);
            pReq = _VoipNarrateFormatBodyWatson(pVoipNarrate, pVoipNarrate->strBody, sizeof(pVoipNarrate->strBody), pText);
            break;

        case VOIPNARRATE_PROVIDER_AMAZON:
            pUrl = _VoipNarrateFormatHeadAmazon(pVoipNarrate, strUrl, sizeof(strUrl), pVoipNarrate->strHead, sizeof(pVoipNarrate->strHead));
            pReq = _VoipNarrateFormatBodyAmazon(pVoipNarrate, pVoipNarrate->strBody, sizeof(pVoipNarrate->strBody), eGender, pText);
            break;

        default:
            NetPrintf(("voipnarrate: undefined provider\n"));
            return(-1);
    }
    NetPrintfVerbose((pVoipNarrate->iVerbose, 1, "voipnarrate: request body\n%s\n", pReq));

    // remember body start (the custom header callback signs against it) and which user this request is for
    pVoipNarrate->pBody = pReq;
    pVoipNarrate->iUserIndex = (int8_t)iUserIndex;

    // set request header
    ProtoStreamControl(pVoipNarrate->pProtoStream, 'apnd', 0, 0, pVoipNarrate->strHead);

    // update request metrics and note the time so response delay can be measured
    pVoipNarrate->Metrics.uEventCount += 1;
    pVoipNarrate->Metrics.uCharCountSent += (uint32_t)strlen(pText);
    pVoipNarrate->uTtsStartTime = NetTick();

    // make the request
    if ((iResult = ProtoStreamOpen2(pVoipNarrate->pProtoStream, pUrl, pReq, PROTOSTREAM_FREQ_ONCE)) >= 0)
    {
        // mark as stream start and active
        pVoipNarrate->bStart = pVoipNarrate->bActive = TRUE;
    }
    else
    {
        NetPrintf(("voipnarrate: failed to open stream\n"));
        pVoipNarrate->Metrics.uErrorCount += 1;
    }

    // return to caller
    return(iResult);
}
/*F********************************************************************************/
/*!
    \Function _VoipNarrateConfig

    \Description
        Configure the VoipNarrate module from the supplied configuration.  If the
        configuration names a provider it is copied into the module state;
        otherwise the module's configuration is cleared and narration is reported
        as disabled.  The update is performed inside NetCritEnter(NULL)/NetCritLeave(NULL).

    \Input *pVoipNarrate    - pointer to module state
    \Input *pConfig         - module configuration to set

    \Output
        uint32_t            - TRUE if configured successfully, FALSE if no provider is set

    \Version 11/07/2018 (jbrookes)
*/
/********************************************************************************F*/
static uint32_t _VoipNarrateConfig(VoipNarrateRefT *pVoipNarrate, VoipNarrateConfigT *pConfig)
{
    uint32_t uConfigured;

    NetCritEnter(NULL);
    if (pConfig->eProvider == VOIPNARRATE_PROVIDER_NONE)
    {
        // no provider; wipe any previous configuration
        NetPrintfVerbose((pVoipNarrate->iVerbose, 0, "voipnarrate: narration disabled\n"));
        ds_memclr(&pVoipNarrate->Config, sizeof(pVoipNarrate->Config));
        uConfigured = FALSE;
    }
    else
    {
        // take a private copy of the configuration
        ds_memcpy_s(&pVoipNarrate->Config, sizeof(pVoipNarrate->Config), pConfig, sizeof(*pConfig));
        uConfigured = TRUE;
    }
    NetCritLeave(NULL);

    return(uConfigured);
}
/*** Public functions *************************************************************/
/*F********************************************************************************/
/*!
    \Function VoipNarrateCreate

    \Description
        Create the narration module: allocate module state, create and set up the
        ProtoStream transport, and apply the global provider configuration.
        Creation fails if the callback is null, an allocation fails, or no
        provider has been configured.

    \Input *pVoiceDataCb    - callback used to provide voice data (required)
    \Input *pUserData       - callback user data

    \Output
        VoipNarrateRefT *   - new module state, or NULL

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
VoipNarrateRefT *VoipNarrateCreate(VoipNarrateVoiceDataCbT *pVoiceDataCb, void *pUserData)
{
    VoipNarrateRefT *pVoipNarrate;
    int32_t iMemGroup;
    void *pMemGroupUserData;

    // query current mem group data
    DirtyMemGroupQuery(&iMemGroup, &pMemGroupUserData);

    // validate callback
    if (pVoiceDataCb == NULL)
    {
        NetPrintf(("voipnarrate: could not create module with null callback\n"));
        return(NULL);
    }

    // allocate and init module state
    if ((pVoipNarrate = DirtyMemAlloc(sizeof(*pVoipNarrate), VOIPNARRATE_MEMID, iMemGroup, pMemGroupUserData)) == NULL)
    {
        NetPrintf(("voipnarrate: could not allocate module state\n"));
        return(NULL);
    }
    ds_memclr(pVoipNarrate, sizeof(*pVoipNarrate));
    pVoipNarrate->iMemGroup = iMemGroup;
    pVoipNarrate->pMemGroupUserData = pMemGroupUserData;
    pVoipNarrate->pVoiceDataCb = pVoiceDataCb;
    pVoipNarrate->pUserData = pUserData;
    pVoipNarrate->iVerbose = 1;

    // allocate streaming module with a buffer to hold up to 1s of 16khz 16bit streaming audio
    if ((pVoipNarrate->pProtoStream = ProtoStreamCreate(16*2*1024)) == NULL)
    {
        VoipNarrateDestroy(pVoipNarrate);
        return(NULL);
    }
    // set protostream callback with a 20ms call rate
    ProtoStreamSetCallback(pVoipNarrate->pProtoStream, 20, _VoipNarrateStreamCallback, pVoipNarrate);
    // set protostream minimum data amount (for base64 decoding; four is the minimum amount but that produces one and a half samples, so we choose eight)
    ProtoStreamControl(pVoipNarrate->pProtoStream, 'minb', VOIPNARRATE_MINBUF, 0, NULL);
    // set protostream debug level
    ProtoStreamControl(pVoipNarrate->pProtoStream, 'spam', 1, 0, NULL);
    // set keepalive
    ProtoStreamControl(pVoipNarrate->pProtoStream, 'keep', 1, 0, NULL);
    // set protostream http custom header callback, used to sign AWS requests
    ProtoStreamSetHttpCallback(pVoipNarrate->pProtoStream, _VoipNarrateCustomHeaderCb, NULL, pVoipNarrate);

    // configure for particular provider; the module is not usable without one
    if (!_VoipNarrateConfig(pVoipNarrate, &_VoipNarrate_Config))
    {
        NetPrintf(("voipnarrate: could not configure for provider\n"));
        VoipNarrateDestroy(pVoipNarrate);
        return(NULL);
    }

    // return ref to caller
    return(pVoipNarrate);
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateConfig

    \Description
        Set global state to configure the VoipNarrate modules.  VoipNarrateCreate()
        copies this global configuration into each module it creates.

    \Input eProvider    - VOIPNARRATE_PROVIDER_* (VOIPNARRATE_PROVIDER_NONE to disable)
    \Input *pUrl        - pointer to url to use for tts requests (NULL is treated as an empty string)
    \Input *pKey        - pointer to authentication key to use for tts requests (NULL is treated as an empty string)

    \Notes
        The url and key are copied into fixed-size buffers in the global
        configuration; the copy is bounded by the size of those buffers.

    \Version 11/07/2018 (jbrookes)
*/
/********************************************************************************F*/
void VoipNarrateConfig(VoipNarrateProviderE eProvider, const char *pUrl, const char *pKey)
{
    /* a caller disabling narration may reasonably pass NULL for the url and key;
       substitute an empty string so a NULL source is never handed to the copy */
    const char *pSafeUrl = (pUrl != NULL) ? pUrl : "";
    const char *pSafeKey = (pKey != NULL) ? pKey : "";

    // update the global config as a unit under the NULL-ref critical section
    NetCritEnter(NULL);
    _VoipNarrate_Config.eProvider = eProvider;
    ds_strnzcpy(_VoipNarrate_Config.strUrl, pSafeUrl, sizeof(_VoipNarrate_Config.strUrl));
    ds_strnzcpy(_VoipNarrate_Config.strKey, pSafeKey, sizeof(_VoipNarrate_Config.strKey));
    NetCritLeave(NULL);
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateDestroy

    \Description
        Destroy the VoipNarrate module: the stream transport (if one was created),
        any requests still queued, and finally the module state itself.

    \Input *pVoipNarrate    - pointer to module state

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
void VoipNarrateDestroy(VoipNarrateRefT *pVoipNarrate)
{
    /* the stream may be absent when called from a failed create, so only
       destroy it when it was actually allocated */
    if (pVoipNarrate->pProtoStream != NULL)
    {
        ProtoStreamDestroy(pVoipNarrate->pProtoStream);
    }

    /* drain the request queue; each get with a NULL output is expected to
       remove and release the head request until none remain */
    for ( ; pVoipNarrate->pRequest != NULL; )
    {
        _VoipNarrateRequestGet(pVoipNarrate, NULL);
    }

    // release module state back to the mem group it was allocated from
    DirtyMemFree(pVoipNarrate, VOIPNARRATE_MEMID, pVoipNarrate->iMemGroup, pVoipNarrate->pMemGroupUserData);
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateInput

    \Description
        Input text to be converted to speech.  If a narration is already active
        the request is queued, otherwise it is started immediately.

    \Input *pVoipNarrate    - pointer to module state
    \Input iUserIndex       - local user index of user who is requesting speech synthesis
    \Input eGender          - preferred gender for voice narration
    \Input *pText           - text to be converted

    \Output
        int32_t             - zero=success, otherwise=failure (-1 if no provider is configured)

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
int32_t VoipNarrateInput(VoipNarrateRefT *pVoipNarrate, int32_t iUserIndex, VoipNarrateGenderE eGender, const char *pText)
{
    int32_t iResult;

    if (pVoipNarrate->Config.eProvider == VOIPNARRATE_PROVIDER_NONE)
    {
        // nothing can be synthesized without a provider
        NetPrintfVerbose((pVoipNarrate->iVerbose, 0, "voipnarrate: no provider configured\n"));
        iResult = -1;
    }
    else if (!pVoipNarrate->bActive)
    {
        // idle: kick off the request right away
        iResult = _VoipNarrateStart(pVoipNarrate, iUserIndex, eGender, pText);
    }
    else
    {
        // a narration is in progress: queue this one behind it
        NetPrintfVerbose((pVoipNarrate->iVerbose, 1, "voipnarrate: queueing request '%s'\n", pText));
        iResult = _VoipNarrateRequestAdd(pVoipNarrate, iUserIndex, eGender, pText);
    }

    return(iResult);
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateStatus

    \Description
        Get module status.

    \Input *pVoipNarrate    - pointer to module state
    \Input iStatus          - status selector
    \Input iValue           - selector specific (not used by 'ttsm', not forwarded to the stream transport)
    \Input *pBuffer         - selector specific
    \Input iBufSize         - selector specific

    \Output
        int32_t             - selector specific

    \Notes
        Other status codes are passed down to the stream transport handler.

    \verbatim
        'ttsm' - get the VoipTextToSpeechMetricsT via pBuffer; returns 0 on success,
                 -1 if pBuffer is NULL or iBufSize is too small to hold the metrics
    \endverbatim

    \Version 11/15/2018 (jbrookes)
*/
/********************************************************************************F*/
int32_t VoipNarrateStatus(VoipNarrateRefT *pVoipNarrate, int32_t iStatus, int32_t iValue, void *pBuffer, int32_t iBufSize)
{
    // anything but the metrics selector belongs to the stream transport handler
    if (iStatus != 'ttsm')
    {
        return(ProtoStreamStatus(pVoipNarrate->pProtoStream, iStatus, pBuffer, iBufSize));
    }

    // metrics query: reject a missing or undersized output buffer
    if ((pBuffer == NULL) || (iBufSize < (int32_t)sizeof(VoipTextToSpeechMetricsT)))
    {
        return(-1);
    }

    // copy the current metrics out to the caller
    ds_memcpy_s(pBuffer, iBufSize, &pVoipNarrate->Metrics, sizeof(VoipTextToSpeechMetricsT));
    return(0);
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateControl

    \Description
        Set control options

    \Input *pVoipNarrate    - pointer to module state
    \Input iControl         - control selector
    \Input iValue           - selector specific
    \Input iValue2          - selector specific
    \Input *pValue          - selector specific

    \Output
        int32_t             - selector specific

    \Notes
        iControl can be one of the following:

    \verbatim
        'ctsm' - clear text to speech metrics in VoipTextToSpeechMetricsT
        'spam' - set verbose debug level (debug only)
    \endverbatim

        Unhandled codes are passed through to the stream transport handler

    \Version 08/30/2018 (jbrookes)
*/
/********************************************************************************F*/
int32_t VoipNarrateControl(VoipNarrateRefT *pVoipNarrate, int32_t iControl, int32_t iValue, int32_t iValue2, void *pValue)
{
    switch (iControl)
    {
        case 'ctsm':
        {
            // handled entirely here: reset the accumulated metrics
            ds_memclr(&(pVoipNarrate->Metrics), sizeof(pVoipNarrate->Metrics));
            return(0);
        }

        #if DIRTYCODE_LOGGING
        case 'spam':
        {
            // record verbosity for this module, then fall out so the stream transport sees it too
            pVoipNarrate->iVerbose = iValue;
            break;
        }
        #endif

        default:
        {
            break;
        }
    }

    // not fully handled above; let the stream transport handler take a stab at it
    return(ProtoStreamControl(pVoipNarrate->pProtoStream, iControl, iValue, iValue2, pValue));
}
/*F********************************************************************************/
/*!
    \Function VoipNarrateUpdate

    \Description
        Update the narration module: start the next queued request when no
        narration is active, then update the stream transport.

    \Input *pVoipNarrate    - pointer to module state

    \Version 10/25/2018 (jbrookes)
*/
/********************************************************************************F*/
void VoipNarrateUpdate(VoipNarrateRefT *pVoipNarrate)
{
    // when idle with a request pending, dequeue it and start narrating
    if (!pVoipNarrate->bActive && (pVoipNarrate->pRequest != NULL))
    {
        VoipNarrateRequestT NextRequest;

        _VoipNarrateRequestGet(pVoipNarrate, &NextRequest);
        // NOTE(review): the result of the start call is not checked here
        _VoipNarrateStart(pVoipNarrate, NextRequest.iUserIndex, NextRequest.eGender, NextRequest.strText);
    }

    // give life to stream module
    ProtoStreamUpdate(pVoipNarrate->pProtoStream);
}