mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-29 19:19:19 -04:00
fix(stablediffusion-ggml): mux LTX-2 audio into output MP4 (#9990)
feat(stablediffusion-ggml): mux LTX-2 audio into output MP4 sd.cpp's generate_video now returns a sd_audio_t* alongside the video frames for models with an audio VAE (LTX-2.3). Our gosd wrapper was already collecting that pointer but immediately freed it without ever muxing it into the output, so LTX-2 generations landed as silent MP4s even though the audio VAE decode succeeded. Stage the planar float32 waveform to a temp WAV (IEEE float, header hand-built; samples interleaved on the fly), then add it as a second ffmpeg input with -c:a aac -map 0:v:0 -map 1:a:0 -shortest. The temp WAV is cleaned up unconditionally after ffmpeg exits, including on the write/waitpid error paths. Non-LTX models (Wan i2v / FLF2V) keep their current behaviour: audio arg is nullptr, the audio-related ffmpeg flags are not added, and no temp file is created. Assisted-by: Claude:claude-opus-4-7 Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -27,6 +27,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <regex>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/wait.h>
|
||||
@@ -1075,9 +1076,71 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
|
||||
// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
|
||||
// interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
|
||||
// Returns 0 on success and fills wav_path (must be at least 64 bytes).
|
||||
static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) {
|
||||
if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav");
|
||||
int fd = mkstemps(wav_path, 4);
|
||||
if (fd < 0) { perror("mkstemps wav"); return -1; }
|
||||
FILE* f = fdopen(fd, "wb");
|
||||
if (!f) { perror("fdopen wav"); close(fd); return -1; }
|
||||
|
||||
uint64_t frames = a->sample_count;
|
||||
uint32_t channels = a->channels;
|
||||
uint32_t sample_rate = a->sample_rate;
|
||||
uint64_t total_samples64 = frames * (uint64_t)channels;
|
||||
uint64_t data_bytes64 = total_samples64 * sizeof(float);
|
||||
if (data_bytes64 > 0xFFFFFFFFull - 44) {
|
||||
fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64);
|
||||
fclose(f);
|
||||
unlink(wav_path);
|
||||
return -1;
|
||||
}
|
||||
uint32_t data_bytes = (uint32_t)data_bytes64;
|
||||
uint32_t riff_size = 36 + data_bytes;
|
||||
uint16_t fmt_code = 3; // WAVE_FORMAT_IEEE_FLOAT
|
||||
uint16_t bits_per_sample = 32;
|
||||
uint16_t block_align = (uint16_t)(channels * sizeof(float));
|
||||
uint32_t byte_rate = sample_rate * block_align;
|
||||
uint16_t ch16 = (uint16_t)channels;
|
||||
uint32_t fmt_size = 16;
|
||||
|
||||
fwrite("RIFF", 1, 4, f);
|
||||
fwrite(&riff_size, 4, 1, f);
|
||||
fwrite("WAVEfmt ", 1, 8, f);
|
||||
fwrite(&fmt_size, 4, 1, f);
|
||||
fwrite(&fmt_code, 2, 1, f);
|
||||
fwrite(&ch16, 2, 1, f);
|
||||
fwrite(&sample_rate, 4, 1, f);
|
||||
fwrite(&byte_rate, 4, 1, f);
|
||||
fwrite(&block_align, 2, 1, f);
|
||||
fwrite(&bits_per_sample, 2, 1, f);
|
||||
fwrite("data", 1, 4, f);
|
||||
fwrite(&data_bytes, 4, 1, f);
|
||||
|
||||
// Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
|
||||
for (uint64_t s = 0; s < frames; s++) {
|
||||
for (uint32_t c = 0; c < channels; c++) {
|
||||
float v = a->data[(size_t)c * frames + s];
|
||||
fwrite(&v, sizeof(float), 1, f);
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
|
||||
// Uses fork+execvp to avoid shell interpretation of dst.
|
||||
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
|
||||
// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
|
||||
// non-null, the audio waveform is staged to a temp WAV and added as a second
|
||||
// ffmpeg input so the final MP4 contains both video and AAC audio.
|
||||
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
|
||||
const sd_audio_t* audio, const char* dst) {
|
||||
if (num_frames <= 0 || !frames || !frames[0].data) {
|
||||
fprintf(stderr, "ffmpeg_mux: empty frames\n");
|
||||
return 1;
|
||||
@@ -1092,38 +1155,87 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
|
||||
snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
|
||||
snprintf(fps_str, sizeof(fps_str), "%d", fps);
|
||||
|
||||
// Optional audio: write a temp WAV file if the model produced audio.
|
||||
char wav_path[64] = {0};
|
||||
bool have_audio = false;
|
||||
if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) {
|
||||
if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) {
|
||||
have_audio = true;
|
||||
fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n",
|
||||
audio->sample_rate, audio->channels, audio->sample_count, wav_path);
|
||||
} else {
|
||||
fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n");
|
||||
}
|
||||
}
|
||||
|
||||
int pipefd[2];
|
||||
if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
|
||||
if (pipe(pipefd) != 0) {
|
||||
perror("pipe");
|
||||
if (have_audio) unlink(wav_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
|
||||
if (pid < 0) {
|
||||
perror("fork");
|
||||
close(pipefd[0]); close(pipefd[1]);
|
||||
if (have_audio) unlink(wav_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
// child
|
||||
close(pipefd[1]);
|
||||
if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
|
||||
close(pipefd[0]);
|
||||
std::vector<char*> argv = {
|
||||
const_cast<char*>("ffmpeg"),
|
||||
const_cast<char*>("-y"),
|
||||
const_cast<char*>("-hide_banner"),
|
||||
const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
|
||||
const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
|
||||
const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
|
||||
const_cast<char*>("-s"), size_str,
|
||||
const_cast<char*>("-framerate"), fps_str,
|
||||
const_cast<char*>("-i"), const_cast<char*>("-"),
|
||||
const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
|
||||
const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
|
||||
const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
|
||||
// Force MP4 container. Distributed LocalAI hands us a staging
|
||||
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
|
||||
// extension; relying on filename suffix makes ffmpeg bail with
|
||||
// "Unable to choose an output format".
|
||||
const_cast<char*>("-f"), const_cast<char*>("mp4"),
|
||||
const_cast<char*>(dst),
|
||||
nullptr
|
||||
};
|
||||
std::vector<char*> argv;
|
||||
argv.push_back(const_cast<char*>("ffmpeg"));
|
||||
argv.push_back(const_cast<char*>("-y"));
|
||||
argv.push_back(const_cast<char*>("-hide_banner"));
|
||||
argv.push_back(const_cast<char*>("-loglevel"));
|
||||
argv.push_back(const_cast<char*>("warning"));
|
||||
// Input 0: raw video from stdin
|
||||
argv.push_back(const_cast<char*>("-f"));
|
||||
argv.push_back(const_cast<char*>("rawvideo"));
|
||||
argv.push_back(const_cast<char*>("-pix_fmt"));
|
||||
argv.push_back(const_cast<char*>(pix_fmt_in));
|
||||
argv.push_back(const_cast<char*>("-s"));
|
||||
argv.push_back(size_str);
|
||||
argv.push_back(const_cast<char*>("-framerate"));
|
||||
argv.push_back(fps_str);
|
||||
argv.push_back(const_cast<char*>("-i"));
|
||||
argv.push_back(const_cast<char*>("-"));
|
||||
// Input 1: optional audio WAV
|
||||
if (have_audio) {
|
||||
argv.push_back(const_cast<char*>("-i"));
|
||||
argv.push_back(wav_path);
|
||||
argv.push_back(const_cast<char*>("-map"));
|
||||
argv.push_back(const_cast<char*>("0:v:0"));
|
||||
argv.push_back(const_cast<char*>("-map"));
|
||||
argv.push_back(const_cast<char*>("1:a:0"));
|
||||
argv.push_back(const_cast<char*>("-c:a"));
|
||||
argv.push_back(const_cast<char*>("aac"));
|
||||
argv.push_back(const_cast<char*>("-b:a"));
|
||||
argv.push_back(const_cast<char*>("192k"));
|
||||
// -shortest so the final clip ends with the shorter of the two
|
||||
// streams — guards against an audio buffer that overshoots the
|
||||
// video duration (or vice versa) on certain LTX variants.
|
||||
argv.push_back(const_cast<char*>("-shortest"));
|
||||
}
|
||||
argv.push_back(const_cast<char*>("-c:v"));
|
||||
argv.push_back(const_cast<char*>("libx264"));
|
||||
argv.push_back(const_cast<char*>("-pix_fmt"));
|
||||
argv.push_back(const_cast<char*>("yuv420p"));
|
||||
argv.push_back(const_cast<char*>("-movflags"));
|
||||
argv.push_back(const_cast<char*>("+faststart"));
|
||||
// Force MP4 container. Distributed LocalAI hands us a staging
|
||||
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
|
||||
// extension; relying on filename suffix makes ffmpeg bail with
|
||||
// "Unable to choose an output format".
|
||||
argv.push_back(const_cast<char*>("-f"));
|
||||
argv.push_back(const_cast<char*>("mp4"));
|
||||
argv.push_back(const_cast<char*>(dst));
|
||||
argv.push_back(nullptr);
|
||||
execvp(argv[0], argv.data());
|
||||
perror("execvp ffmpeg");
|
||||
_exit(127);
|
||||
@@ -1148,6 +1260,7 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
|
||||
close(pipefd[1]);
|
||||
int status;
|
||||
waitpid(pid, &status, 0);
|
||||
if (have_audio) unlink(wav_path);
|
||||
return 1;
|
||||
}
|
||||
p += n;
|
||||
@@ -1158,8 +1271,13 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
|
||||
|
||||
int status = 0;
|
||||
while (waitpid(pid, &status, 0) < 0) {
|
||||
if (errno != EINTR) { perror("waitpid"); return 1; }
|
||||
if (errno != EINTR) {
|
||||
perror("waitpid");
|
||||
if (have_audio) unlink(wav_path);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (have_audio) unlink(wav_path);
|
||||
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
|
||||
fprintf(stderr, "ffmpeg exited with status %d\n", status);
|
||||
return 1;
|
||||
@@ -1234,7 +1352,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int
|
||||
|
||||
fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);
|
||||
|
||||
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);
|
||||
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst);
|
||||
|
||||
for (int i = 0; i < num_frames_out; i++) {
|
||||
if (frames[i].data) free(frames[i].data);
|
||||
|
||||
Reference in New Issue
Block a user