fix(stablediffusion-ggml): mux LTX-2 audio into output MP4 (#9990)

feat(stablediffusion-ggml): mux LTX-2 audio into output MP4

sd.cpp's generate_video now returns a sd_audio_t* alongside the video
frames for models with an audio VAE (LTX-2.3). Our gosd wrapper was
already collecting that pointer but immediately freed it without ever
muxing it into the output, so LTX-2 generations landed as silent MP4s
even though the audio VAE decode succeeded.

Stage the planar float32 waveform to a temp WAV (IEEE float, header
hand-built; samples interleaved on the fly), then add it as a second
ffmpeg input with -c:a aac -map 0:v:0 -map 1:a:0 -shortest. The temp
WAV is cleaned up unconditionally after ffmpeg exits, including on
the write/waitpid error paths.

Non-LTX models (Wan i2v / FLF2V) keep their current behaviour: audio
arg is nullptr, the audio-related ffmpeg flags are not added, and no
temp file is created.

Assisted-by: Claude:claude-opus-4-7

Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-05-25 22:40:16 +02:00
committed by GitHub
parent 1c6c3adad6
commit de2ce74bea

View File

@@ -27,6 +27,7 @@
#include <stdlib.h>
#include <regex>
#include <errno.h>
#include <inttypes.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>
@@ -1075,9 +1076,71 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
return buf;
}
// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
// interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
// Returns 0 on success and fills wav_path (must be at least 64 bytes).
static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) {
if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) {
return -1;
}
snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav");
int fd = mkstemps(wav_path, 4);
if (fd < 0) { perror("mkstemps wav"); return -1; }
FILE* f = fdopen(fd, "wb");
if (!f) { perror("fdopen wav"); close(fd); return -1; }
uint64_t frames = a->sample_count;
uint32_t channels = a->channels;
uint32_t sample_rate = a->sample_rate;
uint64_t total_samples64 = frames * (uint64_t)channels;
uint64_t data_bytes64 = total_samples64 * sizeof(float);
if (data_bytes64 > 0xFFFFFFFFull - 44) {
fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64);
fclose(f);
unlink(wav_path);
return -1;
}
uint32_t data_bytes = (uint32_t)data_bytes64;
uint32_t riff_size = 36 + data_bytes;
uint16_t fmt_code = 3; // WAVE_FORMAT_IEEE_FLOAT
uint16_t bits_per_sample = 32;
uint16_t block_align = (uint16_t)(channels * sizeof(float));
uint32_t byte_rate = sample_rate * block_align;
uint16_t ch16 = (uint16_t)channels;
uint32_t fmt_size = 16;
fwrite("RIFF", 1, 4, f);
fwrite(&riff_size, 4, 1, f);
fwrite("WAVEfmt ", 1, 8, f);
fwrite(&fmt_size, 4, 1, f);
fwrite(&fmt_code, 2, 1, f);
fwrite(&ch16, 2, 1, f);
fwrite(&sample_rate, 4, 1, f);
fwrite(&byte_rate, 4, 1, f);
fwrite(&block_align, 2, 1, f);
fwrite(&bits_per_sample, 2, 1, f);
fwrite("data", 1, 4, f);
fwrite(&data_bytes, 4, 1, f);
// Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
for (uint64_t s = 0; s < frames; s++) {
for (uint32_t c = 0; c < channels; c++) {
float v = a->data[(size_t)c * frames + s];
fwrite(&v, sizeof(float), 1, f);
}
}
fclose(f);
return 0;
}
// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
// Uses fork+execvp to avoid shell interpretation of dst.
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
// non-null, the audio waveform is staged to a temp WAV and added as a second
// ffmpeg input so the final MP4 contains both video and AAC audio.
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
const sd_audio_t* audio, const char* dst) {
if (num_frames <= 0 || !frames || !frames[0].data) {
fprintf(stderr, "ffmpeg_mux: empty frames\n");
return 1;
@@ -1092,38 +1155,87 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
snprintf(fps_str, sizeof(fps_str), "%d", fps);
// Optional audio: write a temp WAV file if the model produced audio.
char wav_path[64] = {0};
bool have_audio = false;
if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) {
if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) {
have_audio = true;
fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n",
audio->sample_rate, audio->channels, audio->sample_count, wav_path);
} else {
fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n");
}
}
int pipefd[2];
if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
if (pipe(pipefd) != 0) {
perror("pipe");
if (have_audio) unlink(wav_path);
return 1;
}
pid_t pid = fork();
if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
if (pid < 0) {
perror("fork");
close(pipefd[0]); close(pipefd[1]);
if (have_audio) unlink(wav_path);
return 1;
}
if (pid == 0) {
// child
close(pipefd[1]);
if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
close(pipefd[0]);
std::vector<char*> argv = {
const_cast<char*>("ffmpeg"),
const_cast<char*>("-y"),
const_cast<char*>("-hide_banner"),
const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
const_cast<char*>("-s"), size_str,
const_cast<char*>("-framerate"), fps_str,
const_cast<char*>("-i"), const_cast<char*>("-"),
const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
// Force MP4 container. Distributed LocalAI hands us a staging
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
// extension; relying on filename suffix makes ffmpeg bail with
// "Unable to choose an output format".
const_cast<char*>("-f"), const_cast<char*>("mp4"),
const_cast<char*>(dst),
nullptr
};
std::vector<char*> argv;
argv.push_back(const_cast<char*>("ffmpeg"));
argv.push_back(const_cast<char*>("-y"));
argv.push_back(const_cast<char*>("-hide_banner"));
argv.push_back(const_cast<char*>("-loglevel"));
argv.push_back(const_cast<char*>("warning"));
// Input 0: raw video from stdin
argv.push_back(const_cast<char*>("-f"));
argv.push_back(const_cast<char*>("rawvideo"));
argv.push_back(const_cast<char*>("-pix_fmt"));
argv.push_back(const_cast<char*>(pix_fmt_in));
argv.push_back(const_cast<char*>("-s"));
argv.push_back(size_str);
argv.push_back(const_cast<char*>("-framerate"));
argv.push_back(fps_str);
argv.push_back(const_cast<char*>("-i"));
argv.push_back(const_cast<char*>("-"));
// Input 1: optional audio WAV
if (have_audio) {
argv.push_back(const_cast<char*>("-i"));
argv.push_back(wav_path);
argv.push_back(const_cast<char*>("-map"));
argv.push_back(const_cast<char*>("0:v:0"));
argv.push_back(const_cast<char*>("-map"));
argv.push_back(const_cast<char*>("1:a:0"));
argv.push_back(const_cast<char*>("-c:a"));
argv.push_back(const_cast<char*>("aac"));
argv.push_back(const_cast<char*>("-b:a"));
argv.push_back(const_cast<char*>("192k"));
// -shortest so the final clip ends with the shorter of the two
// streams — guards against an audio buffer that overshoots the
// video duration (or vice versa) on certain LTX variants.
argv.push_back(const_cast<char*>("-shortest"));
}
argv.push_back(const_cast<char*>("-c:v"));
argv.push_back(const_cast<char*>("libx264"));
argv.push_back(const_cast<char*>("-pix_fmt"));
argv.push_back(const_cast<char*>("yuv420p"));
argv.push_back(const_cast<char*>("-movflags"));
argv.push_back(const_cast<char*>("+faststart"));
// Force MP4 container. Distributed LocalAI hands us a staging
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
// extension; relying on filename suffix makes ffmpeg bail with
// "Unable to choose an output format".
argv.push_back(const_cast<char*>("-f"));
argv.push_back(const_cast<char*>("mp4"));
argv.push_back(const_cast<char*>(dst));
argv.push_back(nullptr);
execvp(argv[0], argv.data());
perror("execvp ffmpeg");
_exit(127);
@@ -1148,6 +1260,7 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
close(pipefd[1]);
int status;
waitpid(pid, &status, 0);
if (have_audio) unlink(wav_path);
return 1;
}
p += n;
@@ -1158,8 +1271,13 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
int status = 0;
while (waitpid(pid, &status, 0) < 0) {
if (errno != EINTR) { perror("waitpid"); return 1; }
if (errno != EINTR) {
perror("waitpid");
if (have_audio) unlink(wav_path);
return 1;
}
}
if (have_audio) unlink(wav_path);
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
fprintf(stderr, "ffmpeg exited with status %d\n", status);
return 1;
@@ -1234,7 +1352,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int
fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst);
for (int i = 0; i < num_frames_out; i++) {
if (frames[i].data) free(frames[i].data);