From a91b05907c4de3468f83ceda3b060bf7d8e2161f Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 5 May 2026 01:50:17 +0200
Subject: [PATCH] feat(swagger): update swagger (#9660)

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 swagger/docs.go      | 124 +++++++++++++++++++++++++++++++++++++++++++
 swagger/swagger.json | 124 +++++++++++++++++++++++++++++++++++++++++++
 swagger/swagger.yaml |  98 ++++++++++++++++++++++++++++++++++
 3 files changed, 346 insertions(+)

diff --git a/swagger/docs.go b/swagger/docs.go
index 014ed6f23..8bc4d8383 100644
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1156,6 +1156,130 @@ const docTemplate = `{
                 }
             }
         },
+        "/audio/transform": {
+            "post": {
+                "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. The backend determines the operation; pass model-specific tuning via repeated ` + "`" + `params[\u003ckey\u003e]=\u003cvalue\u003e` + "`" + ` form fields.",
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "primary input audio file",
+                        "name": "audio",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)",
+                        "name": "reference",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "string",
+                        "description": "wav | mp3 | ogg | flac",
+                        "name": "response_format",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "integer",
+                        "description": "desired output sample rate",
+                        "name": "sample_rate",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "transformed audio file",
+                        "schema": {
+                            "type": "file"
+                        }
+                    }
+                }
+            }
+        },
+        "/audio/transformations": {
+            "post": {
+                "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. The backend determines the operation; pass model-specific tuning via repeated ` + "`" + `params[\u003ckey\u003e]=\u003cvalue\u003e` + "`" + ` form fields.",
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "primary input audio file",
+                        "name": "audio",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)",
+                        "name": "reference",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "string",
+                        "description": "wav | mp3 | ogg | flac",
+                        "name": "response_format",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "integer",
+                        "description": "desired output sample rate",
+                        "name": "sample_rate",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "transformed audio file",
+                        "schema": {
+                            "type": "file"
+                        }
+                    }
+                }
+            }
+        },
+        "/audio/transformations/stream": {
+            "get": {
+                "description": "Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference) and out (mono). The first message must be a JSON ` + "`" + `session.update` + "`" + ` envelope describing model + sample format + frame size + backend params. Server emits binary PCM on the same cadence.",
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Bidirectional realtime audio transform over WebSocket.",
+                "responses": { "101": { "description": "Switching Protocols" } }
+            }
+        },
         "/backend/monitor": {
             "get": {
                 "tags": [
diff --git a/swagger/swagger.json b/swagger/swagger.json
index 2f20b061a..f58ad5c4d 100644
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1153,6 +1153,130 @@
                 }
             }
         },
+        "/audio/transform": {
+            "post": {
+                "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. The backend determines the operation; pass model-specific tuning via repeated `params[\u003ckey\u003e]=\u003cvalue\u003e` form fields.",
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "primary input audio file",
+                        "name": "audio",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)",
+                        "name": "reference",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "string",
+                        "description": "wav | mp3 | ogg | flac",
+                        "name": "response_format",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "integer",
+                        "description": "desired output sample rate",
+                        "name": "sample_rate",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "transformed audio file",
+                        "schema": {
+                            "type": "file"
+                        }
+                    }
+                }
+            }
+        },
+        "/audio/transformations": {
+            "post": {
+                "description": "Runs an audio-in / audio-out transform conditioned on an optional auxiliary reference signal. Concrete transforms include AEC + noise suppression + dereverberation (LocalVQE), voice conversion (reference = target speaker), and pitch shifting. The backend determines the operation; pass model-specific tuning via repeated `params[\u003ckey\u003e]=\u003cvalue\u003e` form fields.",
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Transform audio (echo cancellation, noise suppression, voice conversion, etc.)",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "primary input audio file",
+                        "name": "audio",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "auxiliary reference audio (loopback for AEC, target voice for conversion, etc.)",
+                        "name": "reference",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "string",
+                        "description": "wav | mp3 | ogg | flac",
+                        "name": "response_format",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "integer",
+                        "description": "desired output sample rate",
+                        "name": "sample_rate",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "transformed audio file",
+                        "schema": {
+                            "type": "file"
+                        }
+                    }
+                }
+            }
+        },
+        "/audio/transformations/stream": {
+            "get": {
+                "description": "Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference) and out (mono). The first message must be a JSON `session.update` envelope describing model + sample format + frame size + backend params. Server emits binary PCM on the same cadence.",
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Bidirectional realtime audio transform over WebSocket.",
+                "responses": { "101": { "description": "Switching Protocols" } }
+            }
+        },
         "/backend/monitor": {
             "get": {
                 "tags": [
diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml
index 94a0652b9..f54320173 100644
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -2896,6 +2896,104 @@ paths:
       summary: Clear API traces
       tags:
       - monitoring
+  /audio/transform:
+    post:
+      consumes:
+      - multipart/form-data
+      description: Runs an audio-in / audio-out transform conditioned on an optional
+        auxiliary reference signal. Concrete transforms include AEC + noise suppression
+        + dereverberation (LocalVQE), voice conversion (reference = target speaker),
+        and pitch shifting. The backend determines the operation; pass model-specific
+        tuning via repeated `params[<key>]=<value>` form fields.
+      parameters:
+      - description: model
+        in: formData
+        name: model
+        required: true
+        type: string
+      - description: primary input audio file
+        in: formData
+        name: audio
+        required: true
+        type: file
+      - description: auxiliary reference audio (loopback for AEC, target voice for
+          conversion, etc.)
+        in: formData
+        name: reference
+        type: file
+      - description: wav | mp3 | ogg | flac
+        in: formData
+        name: response_format
+        type: string
+      - description: desired output sample rate
+        in: formData
+        name: sample_rate
+        type: integer
+      produces:
+      - audio/x-wav
+      responses:
+        "200":
+          description: transformed audio file
+          schema:
+            type: file  # binary audio body, paired with `produces` above
+      summary: Transform audio (echo cancellation, noise suppression, voice conversion,
+        etc.)
+      tags:
+      - audio
+  /audio/transformations:
+    post:
+      consumes:
+      - multipart/form-data
+      description: Runs an audio-in / audio-out transform conditioned on an optional
+        auxiliary reference signal. Concrete transforms include AEC + noise suppression
+        + dereverberation (LocalVQE), voice conversion (reference = target speaker),
+        and pitch shifting. The backend determines the operation; pass model-specific
+        tuning via repeated `params[<key>]=<value>` form fields.
+      parameters:
+      - description: model
+        in: formData
+        name: model
+        required: true
+        type: string
+      - description: primary input audio file
+        in: formData
+        name: audio
+        required: true
+        type: file
+      - description: auxiliary reference audio (loopback for AEC, target voice for
+          conversion, etc.)
+        in: formData
+        name: reference
+        type: file
+      - description: wav | mp3 | ogg | flac
+        in: formData
+        name: response_format
+        type: string
+      - description: desired output sample rate
+        in: formData
+        name: sample_rate
+        type: integer
+      produces:
+      - audio/x-wav
+      responses:
+        "200":
+          description: transformed audio file
+          schema:
+            type: file  # binary audio body, paired with `produces` above
+      summary: Transform audio (echo cancellation, noise suppression, voice conversion,
+        etc.)
+      tags:
+      - audio
+  /audio/transformations/stream:
+    get:
+      description: 'Streams binary PCM frames in (interleaved stereo: ch0=audio, ch1=reference)
+        and out (mono). The first message must be a JSON `session.update` envelope
+        describing model + sample format + frame size + backend params. Server emits
+        binary PCM on the same cadence.'
+      responses: {"101": {description: Switching Protocols}}  # WebSocket upgrade
+      summary: Bidirectional realtime audio transform over WebSocket.
+      tags:
+      - audio
   /backend/monitor:
     get:
       parameters: