From 6b87353f1738b36e36939b7e01e7842527c8f88a Mon Sep 17 00:00:00 2001
From: bmaltais
Date: Sun, 27 Nov 2022 16:07:26 -0500
Subject: [PATCH] Adding example for SDv2

---
 ..._train_db_fixed_with-reg_SDv2 512 base.ps1 | 64 +++++++
 v2_inference/v2-inpainting-inference.yaml | 158 ++++++++++++++++++
 v2_inference/v2-midas-inference.yaml | 74 ++++++++
 v2_inference/x4-upscaling.yaml | 76 +++++++++
 4 files changed, 372 insertions(+)
 create mode 100644 examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1
 create mode 100644 v2_inference/v2-inpainting-inference.yaml
 create mode 100644 v2_inference/v2-midas-inference.yaml
 create mode 100644 v2_inference/x4-upscaling.yaml

diff --git a/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1 b/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1
new file mode 100644
index 0000000..80d82a3
--- /dev/null
+++ b/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1
@@ -0,0 +1,64 @@
+# This PowerShell script will create a model using the DreamBooth fine-tuning method. It requires landscape,
+# portrait and square images.
+#
+# Adjust the script to your own needs.
+
+# variable values
+$pretrained_model_name_or_path = "D:\models\512-base-ema.ckpt"
+$data_dir = "D:\models\dariusz_zawadzki\kohya_reg\data"
+$reg_data_dir = "D:\models\dariusz_zawadzki\kohya_reg\reg"
+$logging_dir = "D:\models\dariusz_zawadzki\logs"
+$output_dir = "D:\models\dariusz_zawadzki\train_db_fixed_model_reg_v2"
+$resolution = "512,512"
+$lr_scheduler="polynomial"
+$cache_latents = 1 # 1 = true, 0 = false
+
+$image_num = Get-ChildItem $data_dir -Recurse -File -Include *.png, *.jpg, *.webp | Measure-Object | %{$_.Count}
+
+Write-Output "image_num: $image_num"
+
+$dataset_repeats = 200
+$learning_rate = 2e-6
+$train_batch_size = 4
+$epoch = 1
+$save_every_n_epochs=1
+$mixed_precision="bf16"
+$num_cpu_threads_per_process=6
+
+# You should not have to change values past this point
+if ($cache_latents -eq 1) {
+  $cache_latents_value="--cache_latents"
+}
+else {
+  $cache_latents_value=""
+}
+
+$repeats = $image_num * $dataset_repeats
+$mts = [Math]::Ceiling($repeats / $train_batch_size * $epoch)
+
+Write-Output "Repeats: $repeats"
+
+cd D:\kohya_ss
+.\venv\Scripts\activate
+
+accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
+    --v2 `
+    --pretrained_model_name_or_path=$pretrained_model_name_or_path `
+    --train_data_dir=$data_dir `
+    --output_dir=$output_dir `
+    --resolution=$resolution `
+    --train_batch_size=$train_batch_size `
+    --learning_rate=$learning_rate `
+    --max_train_steps=$mts `
+    --use_8bit_adam `
+    --xformers `
+    --mixed_precision=$mixed_precision `
+    $cache_latents_value `
+    --save_every_n_epochs=$save_every_n_epochs `
+    --logging_dir=$logging_dir `
+    --save_precision="fp16" `
+    --reg_data_dir=$reg_data_dir `
+    --seed=494481440 `
+    --lr_scheduler=$lr_scheduler
+
+# Ship the inference YAML file alongside the model so it loads properly. It needs to have the same base name as the model file, most likely "last.yaml" in our case.
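The inference YAML mentioned in the closing comment has to sit next to the trained checkpoint under a matching base name. A minimal PowerShell sketch of that final step is below; it reuses $output_dir from the script above and assumes the final checkpoint is written as "last.ckpt" and that a v2 inference YAML suited to the 512-base model (for example v2-inference.yaml, which is not one of the files added by this patch) is available locally. Adjust both paths to your setup.

# Sketch only: $yaml_src is an assumed local path, not a file added by this patch.
$yaml_src   = "D:\kohya_ss\v2_inference\v2-inference.yaml"
$model_ckpt = Join-Path $output_dir "last.ckpt"                          # checkpoint name the training run is expected to produce
$yaml_dst   = [System.IO.Path]::ChangeExtension($model_ckpt, ".yaml")    # e.g. ...\last.yaml
Copy-Item -Path $yaml_src -Destination $yaml_dst
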
diff --git a/v2_inference/v2-inpainting-inference.yaml b/v2_inference/v2-inpainting-inference.yaml new file mode 100644 index 0000000..32a9471 --- /dev/null +++ b/v2_inference/v2-inpainting-inference.yaml @@ -0,0 +1,158 @@ +model: + base_learning_rate: 5.0e-05 + target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: hybrid + scale_factor: 0.18215 + monitor: val/loss_simple_ema + finetune_keys: null + use_ema: False + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 9 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" + + +data: + target: ldm.data.laion.WebDataModuleFromConfig + params: + tar_base: null # for concat as in LAION-A + p_unsafe_threshold: 0.1 + filter_word_list: "data/filters.yaml" + max_pwatermark: 0.45 + batch_size: 8 + num_workers: 6 + multinode: True + min_size: 512 + train: + shards: + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" + shuffle: 10000 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + # NOTE use enough shards to avoid empty validation loops in workers + validation: + shards: + - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " + shuffle: 0 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.CenterCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + +lightning: + find_unused_parameters: True + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 10000 + + image_logger: + target: main.ImageLogger + params: + enable_autocast: False + disabled: False + batch_frequency: 1000 + max_images: 4 + increase_log_steps: False + log_first_step: False + 
log_images_kwargs: + use_ema_scope: False + inpaint: False + plot_progressive_rows: False + plot_diffusion_rows: False + N: 4 + unconditional_guidance_scale: 5.0 + unconditional_guidance_label: [""] + ddim_steps: 50 # todo check these out for depth2img, + ddim_eta: 0.0 # todo check these out for depth2img, + + trainer: + benchmark: True + val_check_interval: 5000000 + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 diff --git a/v2_inference/v2-midas-inference.yaml b/v2_inference/v2-midas-inference.yaml new file mode 100644 index 0000000..f20c30f --- /dev/null +++ b/v2_inference/v2-midas-inference.yaml @@ -0,0 +1,74 @@ +model: + base_learning_rate: 5.0e-07 + target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: hybrid + scale_factor: 0.18215 + monitor: val/loss_simple_ema + finetune_keys: null + use_ema: False + + depth_stage_config: + target: ldm.modules.midas.api.MiDaSInference + params: + model_type: "dpt_hybrid" + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 5 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" + + diff --git a/v2_inference/x4-upscaling.yaml b/v2_inference/x4-upscaling.yaml new file mode 100644 index 0000000..2db0964 --- /dev/null +++ b/v2_inference/x4-upscaling.yaml @@ -0,0 +1,76 @@ +model: + base_learning_rate: 1.0e-04 + target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion + params: + parameterization: "v" + low_scale_key: "lr" + linear_start: 0.0001 + linear_end: 0.02 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 128 + channels: 4 + cond_stage_trainable: false + conditioning_key: "hybrid-adm" + monitor: val/loss_simple_ema + scale_factor: 0.08333 + use_ema: False + + low_scale_config: + target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation + params: + noise_schedule_config: # image space + linear_start: 0.0001 + linear_end: 0.02 + max_noise_level: 350 + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) + image_size: 128 + in_channels: 7 + out_channels: 4 + model_channels: 256 + attention_resolutions: [ 2,4,8] + num_res_blocks: 2 + channel_mult: [ 1, 2, 2, 4] + disable_self_attentions: [True, True, True, False] + disable_middle_self_attn: False + num_heads: 8 + use_spatial_transformer: True + 
transformer_depth: 1 + context_dim: 1024 + legacy: False + use_linear_in_transformer: True + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + ddconfig: + # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though) + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" +