From 3af7f3bbd258ee22b9bbe22674c1f94e951a2b58 Mon Sep 17 00:00:00 2001
From: McCloudS <64094529+McCloudS@users.noreply.github.com>
Date: Fri, 12 Jun 2026 14:18:23 +0300
Subject: [PATCH] Fixed whisperai provider audio stream matching by index to
 avoid s16le multi-stream failure

---
 .../subliminal_patch/providers/whisperai.py   | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/custom_libs/subliminal_patch/providers/whisperai.py b/custom_libs/subliminal_patch/providers/whisperai.py
index bfe57fc79..ff00fc4b0 100644
--- a/custom_libs/subliminal_patch/providers/whisperai.py
+++ b/custom_libs/subliminal_patch/providers/whisperai.py
@@ -446,18 +446,23 @@ class WhisperAIProvider(Provider):
             # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
             inp = ffmpeg.input(path, threads=0)
             if audio_stream_language:
-                # There is more than one audio stream, so pick the requested one by name
-                # Use the ISO 639-2 code if available
                 audio_stream_language = wlm.get_ISO_639_2_code(audio_stream_language)
                 logger.debug(f"Whisper will use the '{audio_stream_language}' audio stream for {path}")
-                # 0 = Pick first stream in case there are multiple language streams of the same language,
-                # otherwise ffmpeg will try to combine multiple streams, but our output format doesn't support that.
-                # The first stream is probably the correct one, as later streams are usually commentaries
-                lang_map = f"0:a:m:language:{audio_stream_language}"
-                out = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af=audio_filter,
-                                 map=lang_map)
-            else:
-                out = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af=audio_filter)
+
+            # Probe for the first audio stream matching the requested language.
+            # Mapping by language tag alone (0:a:m:language:X) selects all streams sharing
+            # that tag — s16le only supports one audio stream, so files with duplicate-language
+            # tracks (e.g. stereo + 5.1 both tagged eng) would fail.
+            probe = ffmpeg.probe(path)
+            stream_index = next(
+                (s['index'] for s in probe['streams']
+                 if s['codec_type'] == 'audio'
+                 and s.get('tags', {}).get('language') == audio_stream_language),
+                None
+            ) if audio_stream_language else None
+            map_arg = f"0:{stream_index}" if stream_index is not None else "0:a:0"  # 0:a:0 = first audio stream, avoids mapping all streams if no language matched
+
+            out = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af=audio_filter, map=map_arg)
 
             start_time = time.time()
             out, _ = out.run(cmd=[ffmpeg_path, "-nostdin"], capture_stdout=True, capture_stderr=True)