mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-04 15:04:27 -04:00
feat(diffusers): add support for wan2.2
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -312,15 +312,17 @@ message GenerateImageRequest {
|
||||
|
||||
// GenerateVideoRequest carries the parameters for a single video generation call.
//
// Field numbers 1-10 are kept exactly as they were before negative_prompt and
// step were introduced: protobuf identifies fields on the wire by number, so
// renumbering existing fields would make a client and a backend built from
// different revisions of this file silently misread each other's messages.
// New fields are therefore appended with fresh numbers (11, 12).
message GenerateVideoRequest {
  string prompt = 1;
  string start_image = 2; // Path or base64 encoded image for the start frame
  string end_image = 3; // Path or base64 encoded image for the end frame
  int32 width = 4;
  int32 height = 5;
  int32 num_frames = 6; // Number of frames to generate
  int32 fps = 7; // Frames per second
  int32 seed = 8;
  float cfg_scale = 9; // Classifier-free guidance scale
  string dst = 10; // Output path for the generated video
  string negative_prompt = 11; // Negative prompt for video generation
  int32 step = 12; // Number of inference steps
}
|
||||
|
||||
message TTSRequest {
|
||||
|
||||
@@ -18,7 +18,7 @@ import backend_pb2_grpc
|
||||
import grpc
|
||||
|
||||
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
|
||||
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel, QwenImageEditPipeline
|
||||
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel, QwenImageEditPipeline, AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
|
||||
from diffusers.pipelines.stable_diffusion import safety_checker
|
||||
from diffusers.utils import load_image, export_to_video
|
||||
@@ -334,6 +334,32 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
torch_dtype=torch.bfloat16)
|
||||
self.pipe.vae.to(torch.bfloat16)
|
||||
self.pipe.text_encoder.to(torch.bfloat16)
|
||||
elif request.PipelineType == "WanPipeline":
|
||||
# WAN2.2 pipeline requires special VAE handling
|
||||
vae = AutoencoderKLWan.from_pretrained(
|
||||
request.Model,
|
||||
subfolder="vae",
|
||||
torch_dtype=torch.float32
|
||||
)
|
||||
self.pipe = WanPipeline.from_pretrained(
|
||||
request.Model,
|
||||
vae=vae,
|
||||
torch_dtype=torchType
|
||||
)
|
||||
self.txt2vid = True # WAN2.2 is a text-to-video pipeline
|
||||
elif request.PipelineType == "WanImageToVideoPipeline":
|
||||
# WAN2.2 image-to-video pipeline
|
||||
vae = AutoencoderKLWan.from_pretrained(
|
||||
request.Model,
|
||||
subfolder="vae",
|
||||
torch_dtype=torch.float32
|
||||
)
|
||||
self.pipe = WanImageToVideoPipeline.from_pretrained(
|
||||
request.Model,
|
||||
vae=vae,
|
||||
torch_dtype=torchType
|
||||
)
|
||||
self.img2vid = True # WAN2.2 image-to-video pipeline
|
||||
|
||||
if CLIPSKIP and request.CLIPSkip != 0:
|
||||
self.clip_skip = request.CLIPSkip
|
||||
@@ -575,6 +601,96 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
return backend_pb2.Result(message="Media generated", success=True)
|
||||
|
||||
def GenerateVideo(self, request, context):
    """Generate a video for ``request`` and export it to ``request.dst``.

    Non-positive numeric request fields are treated as "not set" and replaced
    by defaults (81 frames, 16 fps, guidance scale 4.0, 40 inference steps).
    Entries in ``self.options`` are merged on top of the request-derived
    pipeline arguments, so they take precedence over them.

    Args:
        request: GenerateVideoRequest message (prompt, negative_prompt,
            start_image, end_image, width, height, num_frames, fps, seed,
            cfg_scale, step, dst).
        context: gRPC call context (unused).

    Returns:
        backend_pb2.Result with ``success=True`` once the video is exported,
        or ``success=False`` and an explanatory message on any failure. No
        exception is propagated to the caller.
    """
    try:
        prompt = request.prompt
        if not prompt:
            return backend_pb2.Result(success=False, message="No prompt provided for video generation")

        # Set default values from request or use defaults
        num_frames = request.num_frames if request.num_frames > 0 else 81
        fps = request.fps if request.fps > 0 else 16
        cfg_scale = request.cfg_scale if request.cfg_scale > 0 else 4.0
        num_inference_steps = request.step if request.step > 0 else 40

        # Prepare generation parameters
        kwargs = {
            "prompt": prompt,
            "negative_prompt": request.negative_prompt if request.negative_prompt else "",
            "height": request.height if request.height > 0 else 720,
            "width": request.width if request.width > 0 else 1280,
            "num_frames": num_frames,
            "guidance_scale": cfg_scale,
            "num_inference_steps": num_inference_steps,
        }

        # Add custom options from self.options (including guidance_scale_2 if specified)
        kwargs.update(self.options)

        # Set seed if provided
        if request.seed > 0:
            kwargs["generator"] = torch.Generator(device=self.device).manual_seed(request.seed)

        is_wan_t2v = self.PipelineType == "WanPipeline"
        is_wan_i2v = self.PipelineType == "WanImageToVideoPipeline"

        # The Wan pipelines do not take `start_image` / `end_image` keywords
        # (the image-to-video variant names them `image` / `last_image`), so
        # forwarding them would raise a TypeError. Only the generic branches
        # keep receiving the images under the original keyword names.
        # NOTE(review): confirm which generic pipelines actually accept these keywords.
        if not (is_wan_t2v or is_wan_i2v):
            if request.start_image:
                kwargs["start_image"] = load_image(request.start_image)
            if request.end_image:
                kwargs["end_image"] = load_image(request.end_image)

        print(f"Generating video with {kwargs=}", file=sys.stderr)

        # Generate video frames based on pipeline type
        if is_wan_t2v:
            # WAN2.2 text-to-video generation: the pipeline has no image inputs.
            if request.start_image or request.end_image:
                print("WanPipeline is text-to-video only: ignoring start_image/end_image", file=sys.stderr)
            output = self.pipe(**kwargs)
            frames = output.frames[0]  # WAN2.2 returns frames in this format
        elif is_wan_i2v:
            # WAN2.2 image-to-video generation
            if not request.start_image:
                # Fail with a clear message instead of a missing-argument TypeError.
                return backend_pb2.Result(success=False, message="WanImageToVideoPipeline requires a start_image")

            # Load and resize the input image according to WAN2.2 requirements
            image = load_image(request.start_image)
            # Use request dimensions or defaults, but respect WAN2.2 constraints:
            # keep the requested pixel area, follow the image's aspect ratio and
            # snap both sides to a multiple of mod_value.
            request_height = request.height if request.height > 0 else 480
            request_width = request.width if request.width > 0 else 832
            max_area = request_height * request_width
            aspect_ratio = image.height / image.width
            mod_value = self.pipe.vae_scale_factor_spatial * self.pipe.transformer.config.patch_size[1]
            height = round((max_area * aspect_ratio) ** 0.5 / mod_value) * mod_value
            width = round((max_area / aspect_ratio) ** 0.5 / mod_value) * mod_value
            kwargs["image"] = image.resize((width, height))
            kwargs["height"] = height
            kwargs["width"] = width

            if request.end_image:
                # Optional final frame, passed under the pipeline's own keyword.
                kwargs["last_image"] = load_image(request.end_image)

            output = self.pipe(**kwargs)
            frames = output.frames[0]
        elif self.img2vid:
            # Generic image-to-video generation
            if request.start_image:
                image = load_image(request.start_image)
                image = image.resize((request.width if request.width > 0 else 1024,
                                      request.height if request.height > 0 else 576))
                kwargs["image"] = image

            output = self.pipe(**kwargs)
            frames = output.frames[0]
        elif self.txt2vid:
            # Generic text-to-video generation
            output = self.pipe(**kwargs)
            frames = output.frames[0]
        else:
            return backend_pb2.Result(success=False, message=f"Pipeline {self.PipelineType} does not support video generation")

        # Export video
        export_to_video(frames, request.dst, fps=fps)

        return backend_pb2.Result(message="Video generated successfully", success=True)

    except Exception as err:
        print(f"Error generating video: {err}", file=sys.stderr)
        traceback.print_exc()
        return backend_pb2.Result(success=False, message=f"Error generating video: {err}")
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() error, error) {
|
||||
func VideoGeneration(height, width int32, prompt, negativePrompt, startImage, endImage, dst string, numFrames, fps, seed int32, cfgScale float32, step int32, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() error, error) {
|
||||
|
||||
opts := ModelOptions(modelConfig, appConfig)
|
||||
inferenceModel, err := loader.Load(
|
||||
@@ -22,12 +22,18 @@ func VideoGeneration(height, width int32, prompt, startImage, endImage, dst stri
|
||||
_, err := inferenceModel.GenerateVideo(
|
||||
appConfig.Context,
|
||||
&proto.GenerateVideoRequest{
|
||||
Height: height,
|
||||
Width: width,
|
||||
Prompt: prompt,
|
||||
StartImage: startImage,
|
||||
EndImage: endImage,
|
||||
Dst: dst,
|
||||
Height: height,
|
||||
Width: width,
|
||||
Prompt: prompt,
|
||||
NegativePrompt: negativePrompt,
|
||||
StartImage: startImage,
|
||||
EndImage: endImage,
|
||||
NumFrames: numFrames,
|
||||
Fps: fps,
|
||||
Seed: seed,
|
||||
CfgScale: cfgScale,
|
||||
Step: step,
|
||||
Dst: dst,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -61,7 +61,7 @@ func downloadFile(url string) (string, error) {
|
||||
*/
|
||||
// VideoEndpoint
|
||||
// @Summary Creates a video given a prompt.
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Param request body schema.VideoRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /video [post]
|
||||
func VideoEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
@@ -166,7 +166,23 @@ func VideoEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
|
||||
|
||||
baseURL := c.BaseURL()
|
||||
|
||||
fn, err := backend.VideoGeneration(height, width, input.Prompt, src, input.EndImage, output, ml, *config, appConfig)
|
||||
fn, err := backend.VideoGeneration(
|
||||
height,
|
||||
width,
|
||||
input.Prompt,
|
||||
input.NegativePrompt,
|
||||
src,
|
||||
input.EndImage,
|
||||
output,
|
||||
input.NumFrames,
|
||||
input.FPS,
|
||||
input.Seed,
|
||||
input.CFGScale,
|
||||
input.Step,
|
||||
ml,
|
||||
*config,
|
||||
appConfig,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ type GalleryResponse struct {
|
||||
type VideoRequest struct {
|
||||
BasicModelRequest
|
||||
Prompt string `json:"prompt" yaml:"prompt"`
|
||||
NegativePrompt string `json:"negative_prompt" yaml:"negative_prompt"`
|
||||
StartImage string `json:"start_image" yaml:"start_image"`
|
||||
EndImage string `json:"end_image" yaml:"end_image"`
|
||||
Width int32 `json:"width" yaml:"width"`
|
||||
@@ -36,6 +37,7 @@ type VideoRequest struct {
|
||||
FPS int32 `json:"fps" yaml:"fps"`
|
||||
Seed int32 `json:"seed" yaml:"seed"`
|
||||
CFGScale float32 `json:"cfg_scale" yaml:"cfg_scale"`
|
||||
Step int32 `json:"step" yaml:"step"`
|
||||
ResponseFormat string `json:"response_format" yaml:"response_format"`
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user