From a91b05907c4de3468f83ceda3b060bf7d8e2161f Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Tue, 5 May 2026 01:50:17 +0200 Subject: [PATCH] feat(swagger): update swagger (#9660) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- swagger/docs.go | 124 +++++++++++++++++++++++++++++++++++++++++++ swagger/swagger.json | 124 +++++++++++++++++++++++++++++++++++++++++++ swagger/swagger.yaml | 98 ++++++++++++++++++++++++++++++++++ 3 files changed, 346 insertions(+) diff --git a/swagger/docs.go b/swagger/docs.go index 014ed6f23..8bc4d8383 100644 --- a/swagger/docs.go +++ b/swagger/docs.go @@ -1156,6 +1156,130 @@ const docTemplate = `{ } } }, + "/audio/transform": { + "post": { + "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. 
The backend determines the operation; pass model-specific tuning via repeated ` + "`" + `params[\u003ckey\u003e]=\u003cvalue\u003e` + "`" + ` form fields.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "audio/x-wav" + ], + "tags": [ + "audio" + ], + "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)", + "parameters": [ + { + "type": "string", + "description": "model", + "name": "model", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "primary input audio file", + "name": "audio", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)", + "name": "reference", + "in": "formData" + }, + { + "type": "string", + "description": "wav | mp3 | ogg | flac", + "name": "response_format", + "in": "formData" + }, + { + "type": "integer", + "description": "desired output sample rate", + "name": "sample_rate", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "transformed audio file", + "schema": { + "type": "string" + } + } + } + } + }, + "/audio/transformations": { + "post": { + "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. 
The backend determines the operation; pass model-specific tuning via repeated ` + "`" + `params[\u003ckey\u003e]=\u003cvalue\u003e` + "`" + ` form fields.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "audio/x-wav" + ], + "tags": [ + "audio" + ], + "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)", + "parameters": [ + { + "type": "string", + "description": "model", + "name": "model", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "primary input audio file", + "name": "audio", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)", + "name": "reference", + "in": "formData" + }, + { + "type": "string", + "description": "wav | mp3 | ogg | flac", + "name": "response_format", + "in": "formData" + }, + { + "type": "integer", + "description": "desired output sample rate", + "name": "sample_rate", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "transformed audio file", + "schema": { + "type": "string" + } + } + } + } + }, + "/audio/transformations/stream": { + "get": { + "description": "Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference) and out (mono). The first message must be a JSON ` + "`" + `session.update` + "`" + ` envelope describing model + sample format + frame size + backend params. Server emits binary PCM on the same cadence.", + "tags": [ + "audio" + ], + "summary": "Bidirectional realtime audio transform over WebSocket.", + "responses": {} + } + }, "/backend/monitor": { "get": { "tags": [ diff --git a/swagger/swagger.json b/swagger/swagger.json index 2f20b061a..f58ad5c4d 100644 --- a/swagger/swagger.json +++ b/swagger/swagger.json @@ -1153,6 +1153,130 @@ } } }, + "/audio/transform": { + "post": { + "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. 
Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. The backend determines the operation; pass model-specific tuning via repeated `params[\u003ckey\u003e]=\u003cvalue\u003e` form fields.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "audio/x-wav" + ], + "tags": [ + "audio" + ], + "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)", + "parameters": [ + { + "type": "string", + "description": "model", + "name": "model", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "primary input audio file", + "name": "audio", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)", + "name": "reference", + "in": "formData" + }, + { + "type": "string", + "description": "wav | mp3 | ogg | flac", + "name": "response_format", + "in": "formData" + }, + { + "type": "integer", + "description": "desired output sample rate", + "name": "sample_rate", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "transformed audio file", + "schema": { + "type": "string" + } + } + } + } + }, + "/audio/transformations": { + "post": { + "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. 
The backend determines the operation; pass model-specific tuning via repeated `params[\u003ckey\u003e]=\u003cvalue\u003e` form fields.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "audio/x-wav" + ], + "tags": [ + "audio" + ], + "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)", + "parameters": [ + { + "type": "string", + "description": "model", + "name": "model", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "primary input audio file", + "name": "audio", + "in": "formData", + "required": true + }, + { + "type": "file", + "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)", + "name": "reference", + "in": "formData" + }, + { + "type": "string", + "description": "wav | mp3 | ogg | flac", + "name": "response_format", + "in": "formData" + }, + { + "type": "integer", + "description": "desired output sample rate", + "name": "sample_rate", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "transformed audio file", + "schema": { + "type": "string" + } + } + } + } + }, + "/audio/transformations/stream": { + "get": { + "description": "Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference) and out (mono). The first message must be a JSON `session.update` envelope describing model + sample format + frame size + backend params. 
Server emits binary PCM on the same cadence.", + "tags": [ + "audio" + ], + "summary": "Bidirectional realtime audio transform over WebSocket.", + "responses": {} + } + }, "/backend/monitor": { "get": { "tags": [ diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml index 94a0652b9..f54320173 100644 --- a/swagger/swagger.yaml +++ b/swagger/swagger.yaml @@ -2896,6 +2896,104 @@ paths: summary: Clear API traces tags: - monitoring + /audio/transform: + post: + consumes: + - multipart/form-data + description: Runs an audio-in / audio-out transform conditioned on an optional + auxiliary reference signal. Concrete transforms include AEC + noise suppression + + dereverberation (LocalVQE), voice conversion (reference = target speaker), + and pitch shifting. The backend determines the operation; pass model-specific + tuning via repeated `params[<key>]=<value>` form fields. + parameters: + - description: model + in: formData + name: model + required: true + type: string + - description: primary input audio file + in: formData + name: audio + required: true + type: file + - description: auxiliary reference audio (loopback for AEC, target voice for + conversion, etc.) + in: formData + name: reference + type: file + - description: wav | mp3 | ogg | flac + in: formData + name: response_format + type: string + - description: desired output sample rate + in: formData + name: sample_rate + type: integer + produces: + - audio/x-wav + responses: + "200": + description: transformed audio file + schema: + type: string + summary: Transform audio (echo cancellation, noise suppression, voice conversion, + etc.) + tags: + - audio + /audio/transformations: + post: + consumes: + - multipart/form-data + description: Runs an audio-in / audio-out transform conditioned on an optional + auxiliary reference signal. Concrete transforms include AEC + noise suppression + + dereverberation (LocalVQE), voice conversion (reference = target speaker), + and pitch shifting. 
The backend determines the operation; pass model-specific + tuning via repeated `params[<key>]=<value>` form fields. + parameters: + - description: model + in: formData + name: model + required: true + type: string + - description: primary input audio file + in: formData + name: audio + required: true + type: file + - description: auxiliary reference audio (loopback for AEC, target voice for + conversion, etc.) + in: formData + name: reference + type: file + - description: wav | mp3 | ogg | flac + in: formData + name: response_format + type: string + - description: desired output sample rate + in: formData + name: sample_rate + type: integer + produces: + - audio/x-wav + responses: + "200": + description: transformed audio file + schema: + type: string + summary: Transform audio (echo cancellation, noise suppression, voice conversion, + etc.) + tags: + - audio + /audio/transformations/stream: + get: + description: 'Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference) + and out (mono). The first message must be a JSON `session.update` envelope + describing model + sample format + frame size + backend params. Server emits + binary PCM on the same cadence.' + responses: {} + summary: Bidirectional realtime audio transform over WebSocket. + tags: + - audio /backend/monitor: get: parameters: