Update train_db_fixed to v9
parent f56340d53e
commit 0e8b993def

README.md (16 lines changed)
@@ -113,9 +113,10 @@ accelerate launch --num_cpu_threads_per_process 6 train_db_fixed-ber.py `
  --cache_latents `
  --save_every_n_epochs=1 `
  --fine_tuning `
  --enable_bucket `
  --dataset_repeats=200 `
  --seed=23 `
  --save_half
  --save_precision="fp16"
```

Refer to this url for more details about finetuning: https://note.com/kohya_ss/n/n1269f1e1a54e

@@ -125,7 +126,12 @@ Refer to this url for more details about finetuning: https://note.com/kohya_ss/n

* 11/7 (v7): Text Encoder supports checkpoint files in different storage formats (they are converted at import time, so export is in the normal format). The average epoch loss is now output to the screen. Added a function to save the epoch and global step in the checkpoint in SD format (values are added to any existing data). The reg_data_dir option is enabled during fine tuning (fine tuning while mixing in regularized images). Added a dataset_repeats option that is valid for fine tuning (specify it when the number of teacher images is small and an epoch would otherwise be extremely short).
* 11/9 (v8): supports Diffusers 0.7.2. To upgrade diffusers run `pip install --upgrade diffusers[torch]`
* 11/14 (diffusers_fine_tuning v2):
  - The script name is now fine_tune.py.
  - Added an option to train the Text Encoder: --train_text_encoder.
  - The data format of the checkpoint at save time can be specified with the --save_precision option. You can choose float, fp16, or bf16.
  - Added a --save_state option to save the training state (optimizer, etc.) mid-run. Training can be resumed with the --resume option.
* 11/18 (v9):
  - Added support for Aspect Ratio Bucketing (--enable_bucket).
  - Added support for selecting the data format (fp16/bf16/float) when saving the checkpoint (--save_precision).
  - Added support for saving the training state (--save_state, --resume).
  - Added support for logging (--logging_dir).
@@ -2,9 +2,12 @@
#
# Useful to create a base caption that will be augmented on a per image basis

$folder = "D:\dreambooth\train_sylvia_ritter\raw_data\all-images\"
$folder = "D:\some\folder\location\"
$file_pattern="*.*"
$text_fir_file="a digital painting of xxx, by silvery trait"
$caption_text="some caption text"

$files = Get-ChildItem $folder$file_pattern
foreach ($file in $files) {New-Item -ItemType file -Path $folder -Name "$($file.BaseName).txt" -Value $text_fir_file}
$files = Get-ChildItem $folder$file_pattern -Include *.png,*.jpg,*.webp -File
foreach ($file in $files)
{
    New-Item -ItemType file -Path $folder -Name "$($file.BaseName).txt" -Value $caption_text
}
examples/caption_subfolders.ps1 (new file, 20 lines)
@@ -0,0 +1,20 @@
# This powershell script will create a text file for each file in the folder
#
# Useful to create a base caption that will be augmented on a per image basis

$folder = "D:\test\t2\"
$file_pattern="*.*"
$text_fir_file="bigeyes style"

foreach ($file in Get-ChildItem $folder\$file_pattern -File)
{
    New-Item -ItemType file -Path $folder -Name "$($file.BaseName).txt" -Value $text_fir_file
}

foreach($directory in Get-ChildItem -path $folder -Directory)
{
    foreach ($file in Get-ChildItem $folder\$directory\$file_pattern)
    {
        New-Item -ItemType file -Path $folder\$directory -Name "$($file.BaseName).txt" -Value $text_fir_file
    }
}
examples/kohya-1-folders.ps1 (new file, 87 lines)
@@ -0,0 +1,87 @@
# This powershell script will create a model using the fine tuning dreambooth method. It will require landscape,
# portrait and square images.
#
# Adjust the script to your own needs

# Sylvia Ritter
# variable values
$pretrained_model_name_or_path = "D:\models\v1-5-pruned-mse-vae.ckpt"
$data_dir = "D:\test\squat"
$train_dir = "D:\test\"
$resolution = "512,512"

$image_num = Get-ChildItem $data_dir -Recurse -File -Include *.png | Measure-Object | %{$_.Count}

Write-Output "image_num: $image_num"

$learning_rate = 1e-6
$dataset_repeats = 40
$train_batch_size = 8
$epoch = 1
$save_every_n_epochs=1
$mixed_precision="fp16"
$num_cpu_threads_per_process=6

# You should not have to change values past this point

$output_dir = $train_dir + "\model"
$repeats = $image_num * $dataset_repeats
$mts = [Math]::Ceiling($repeats / $train_batch_size * $epoch)

Write-Output "Repeats: $repeats"

.\venv\Scripts\activate

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed-ber.py `
--pretrained_model_name_or_path=$pretrained_model_name_or_path `
--train_data_dir=$data_dir `
--output_dir=$output_dir `
--resolution=$resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$mts `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$dataset_repeats `
--save_precision="fp16"

# 2nd pass at half the dataset repeat value

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$output_dir"\last.ckpt" `
--train_data_dir=$data_dir `
--output_dir=$output_dir"2" `
--resolution=$resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$([Math]::Ceiling($mts/2)) `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) `
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed-ber.py `
--pretrained_model_name_or_path=$output_dir"\last.ckpt" `
--train_data_dir=$data_dir `
--output_dir=$output_dir"2" `
--resolution=$resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$mts `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$dataset_repeats `
--save_precision="fp16"
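The script above derives max_train_steps as ceil(image_num × dataset_repeats / train_batch_size × epoch). A quick check of that arithmetic with made-up numbers (sketched in Python rather than PowerShell, not part of the script):

```
import math

# Hypothetical: 100 training images with the script's defaults of
# 40 dataset repeats, batch size 8 and 1 epoch.
image_num, dataset_repeats, train_batch_size, epoch = 100, 40, 8, 1
repeats = image_num * dataset_repeats                 # 4000
mts = math.ceil(repeats / train_batch_size * epoch)   # 500 max train steps
print(repeats, mts)
```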
examples/kohya-3-folders.ps1 (new file, 154 lines)
@@ -0,0 +1,154 @@
# This powershell script will create a model using the fine tuning dreambooth method. It will require landscape,
# portrait and square images.
#
# Adjust the script to your own needs

# Sylvia Ritter
# variable values
$pretrained_model_name_or_path = "D:\models\v1-5-pruned-mse-vae.ckpt"
$train_dir = "D:\dreambooth\train_sylvia_ritter\raw_data"

$landscape_image_num = 4
$portrait_image_num = 25
$square_image_num = 2

$learning_rate = 1e-6
$dataset_repeats = 120
$train_batch_size = 4
$epoch = 1
$save_every_n_epochs=1
$mixed_precision="fp16"
$num_cpu_threads_per_process=6

$landscape_folder_name = "landscape-pp"
$landscape_resolution = "832,512"
$portrait_folder_name = "portrait-pp"
$portrait_resolution = "448,896"
$square_folder_name = "square-pp"
$square_resolution = "512,512"

# You should not have to change values past this point

$landscape_data_dir = $train_dir + "\" + $landscape_folder_name
$portrait_data_dir = $train_dir + "\" + $portrait_folder_name
$square_data_dir = $train_dir + "\" + $square_folder_name
$landscape_output_dir = $train_dir + "\model-l"
$portrait_output_dir = $train_dir + "\model-lp"
$square_output_dir = $train_dir + "\model-lps"

$landscape_repeats = $landscape_image_num * $dataset_repeats
$portrait_repeats = $portrait_image_num * $dataset_repeats
$square_repeats = $square_image_num * $dataset_repeats

$landscape_mts = [Math]::Ceiling($landscape_repeats / $train_batch_size * $epoch)
$portrait_mts = [Math]::Ceiling($portrait_repeats / $train_batch_size * $epoch)
$square_mts = [Math]::Ceiling($square_repeats / $train_batch_size * $epoch)

# Write-Output $landscape_repeats

.\venv\Scripts\activate

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$pretrained_model_name_or_path `
--train_data_dir=$landscape_data_dir `
--output_dir=$landscape_output_dir `
--resolution=$landscape_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$landscape_mts `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$dataset_repeats `
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$landscape_output_dir"\last.ckpt" `
--train_data_dir=$portrait_data_dir `
--output_dir=$portrait_output_dir `
--resolution=$portrait_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$portrait_mts `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$dataset_repeats `
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$portrait_output_dir"\last.ckpt" `
--train_data_dir=$square_data_dir `
--output_dir=$square_output_dir `
--resolution=$square_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$square_mts `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$dataset_repeats `
--save_precision="fp16"

# 2nd pass at half the dataset repeat value

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$square_output_dir"\last.ckpt" `
--train_data_dir=$landscape_data_dir `
--output_dir=$landscape_output_dir"2" `
--resolution=$landscape_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$([Math]::Ceiling($landscape_mts/2)) `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) `
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$landscape_output_dir"2\last.ckpt" `
--train_data_dir=$portrait_data_dir `
--output_dir=$portrait_output_dir"2" `
--resolution=$portrait_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$([Math]::Ceiling($portrait_mts/2)) `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) `
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py `
--pretrained_model_name_or_path=$portrait_output_dir"2\last.ckpt" `
--train_data_dir=$square_data_dir `
--output_dir=$square_output_dir"2" `
--resolution=$square_resolution `
--train_batch_size=$train_batch_size `
--learning_rate=$learning_rate `
--max_train_steps=$([Math]::Ceiling($square_mts/2)) `
--use_8bit_adam `
--xformers `
--mixed_precision=$mixed_precision `
--cache_latents `
--save_every_n_epochs=$save_every_n_epochs `
--fine_tuning `
--dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) `
--save_precision="fp16"
@@ -55,7 +55,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\
--use_8bit_adam --xformers `
--mixed_precision=$mixed_precision `
--save_every_n_epochs=$save_every_n_epochs `
--save_half
--save_precision="fp16"

accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\diffusers_fine_tuning\fine_tune.py `
--pretrained_model_name_or_path=$train_dir"\fine_tuned\last.ckpt" `
@@ -69,4 +69,4 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\
--use_8bit_adam --xformers `
--mixed_precision=$mixed_precision `
--save_every_n_epochs=$save_every_n_epochs `
--save_half
--save_precision="fp16"
@@ -4,7 +4,9 @@
# v7: another text encoder ckpt format, average loss, save epochs/global steps, show num of train/reg images,
#     enable reg images in fine-tuning, add dataset_repeats option
# v8: supports Diffusers 0.7.2
# v9: add bucketing option

import time
from torch.autograd.function import Function
import argparse
import glob
@@ -56,13 +58,40 @@ VAE_PARAMS_NUM_RES_BLOCKS = 2

# checkpoint file names
LAST_CHECKPOINT_NAME = "last.ckpt"
LAST_STATE_NAME = "last-state"
EPOCH_CHECKPOINT_NAME = "epoch-{:06d}.ckpt"
EPOCH_STATE_NAME = "epoch-{:06d}-state"


def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64):
  max_width, max_height = max_reso
  max_area = (max_width // divisible) * (max_height // divisible)

  resos = set()

  size = int(math.sqrt(max_area)) * divisible
  resos.add((size, size))

  size = min_size
  while size <= max_size:
    width = size
    height = min(max_size, (max_area // (width // divisible)) * divisible)
    resos.add((width, height))
    resos.add((height, width))
    size += divisible

  resos = list(resos)
  resos.sort()

  aspect_ratios = [w / h for w, h in resos]
  return resos, aspect_ratios
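For a sense of what this helper returns, a minimal sketch (assuming make_bucket_resolutions above is in scope): for a 512x512 base resolution it yields pairs such as (256, 1024), (448, 576), (512, 512), (576, 448) and (1024, 256), all multiples of 64 with roughly the same pixel area.

```
# Illustrative only: list the buckets generated for the default 512x512 resolution.
resos, aspect_ratios = make_bucket_resolutions((512, 512))
for reso, ar in zip(resos, aspect_ratios):
  print(reso, round(ar, 3))
```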
class DreamBoothOrFineTuningDataset(torch.utils.data.Dataset):
  def __init__(self, fine_tuning, train_img_path_captions, reg_img_path_captions, tokenizer, resolution, prior_loss_weight, flip_aug, color_aug, face_crop_aug_range, random_crop, shuffle_caption, disable_padding, debug_dataset) -> None:
  def __init__(self, batch_size, fine_tuning, train_img_path_captions, reg_img_path_captions, tokenizer, resolution, prior_loss_weight, flip_aug, color_aug, face_crop_aug_range, random_crop, shuffle_caption, disable_padding, debug_dataset) -> None:
    super().__init__()

    self.batch_size = batch_size
    self.fine_tuning = fine_tuning
    self.train_img_path_captions = train_img_path_captions
    self.reg_img_path_captions = reg_img_path_captions
@@ -76,6 +105,7 @@ class DreamBoothOrFineTuningDataset(torch.utils.data.Dataset):
    self.shuffle_caption = shuffle_caption
    self.disable_padding = disable_padding
    self.latents_cache = None
    self.enable_bucket = False

    # augmentation
    flip_p = 0.5 if flip_aug else 0.0
|
||||
|
||||
self.enable_reg_images = self.num_reg_images > 0
|
||||
|
||||
if not self.enable_reg_images:
|
||||
self._length = self.num_train_images
|
||||
else:
|
||||
# 学習データの倍として、奇数ならtrain
|
||||
self._length = self.num_train_images * 2
|
||||
if self._length // 2 < self.num_reg_images:
|
||||
print("some of reg images are not used / 正則化画像の数が多いので、一部使用されない正則化画像があります")
|
||||
if self.enable_reg_images and self.num_train_images < self.num_reg_images:
|
||||
print("some of reg images are not used / 正則化画像の数が多いので、一部使用されない正則化画像があります")
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
@@ -117,6 +142,132 @@ class DreamBoothOrFineTuningDataset(torch.utils.data.Dataset):
        ]
    )

  # must be called even when bucketing is disabled (creates a single bucket)
  def make_buckets_with_caching(self, enable_bucket, vae):
    self.enable_bucket = enable_bucket

    cache_latents = vae is not None
    if cache_latents:
      if enable_bucket:
        print("cache latents with bucketing")
      else:
        print("cache latents")
    else:
      if enable_bucket:
        print("make buckets")
      else:
        print("prepare dataset")

    # prepare the buckets
    if enable_bucket:
      bucket_resos, bucket_aspect_ratios = make_bucket_resolutions((self.width, self.height))
    else:
      # only one bucket; all images share the same resolution
      bucket_resos = [(self.width, self.height)]
      bucket_aspect_ratios = [self.width / self.height]
    bucket_aspect_ratios = np.array(bucket_aspect_ratios)

    # collect each image's resolution and latent in advance
    img_ar_errors = []
    self.size_lat_cache = {}
    for image_path, _ in tqdm(self.train_img_path_captions + self.reg_img_path_captions):
      if image_path in self.size_lat_cache:
        continue

      image = self.load_image(image_path)[0]
      image_height, image_width = image.shape[0:2]

      if not enable_bucket:
        # assert image_width == self.width and image_height == self.height, \
        #     f"all images must have specific resolution when bucketing is disabled / bucketを使わない場合、すべての画像のサイズを統一してください: {image_path}"
        reso = (self.width, self.height)
      else:
        # pick the bucket
        aspect_ratio = image_width / image_height
        ar_errors = bucket_aspect_ratios - aspect_ratio
        bucket_id = np.abs(ar_errors).argmin()
        reso = bucket_resos[bucket_id]
        ar_error = ar_errors[bucket_id]
        img_ar_errors.append(ar_error)

      if cache_latents:
        image = self.resize_and_trim(image, reso)

      # get the latent
      if cache_latents:
        img_tensor = self.image_transforms(image)
        img_tensor = img_tensor.unsqueeze(0).to(device=vae.device, dtype=vae.dtype)
        latents = vae.encode(img_tensor).latent_dist.sample().squeeze(0).to("cpu")
      else:
        latents = None

      self.size_lat_cache[image_path] = (reso, latents)

    # split the images into buckets
    self.buckets = [[] for _ in range(len(bucket_resos))]
    reso_to_index = {}
    for i, reso in enumerate(bucket_resos):
      reso_to_index[reso] = i

    def split_to_buckets(is_reg, img_path_captions):
      for image_path, caption in img_path_captions:
        reso, _ = self.size_lat_cache[image_path]
        bucket_index = reso_to_index[reso]
        self.buckets[bucket_index].append((is_reg, image_path, caption))

    split_to_buckets(False, self.train_img_path_captions)

    if self.enable_reg_images:
      l = []
      while len(l) < len(self.train_img_path_captions):
        l += self.reg_img_path_captions
      l = l[:len(self.train_img_path_captions)]
      split_to_buckets(True, l)

    if enable_bucket:
      print("number of images with repeats / 繰り返し回数込みの各bucketの画像枚数")
      for i, (reso, imgs) in enumerate(zip(bucket_resos, self.buckets)):
        print(f"bucket {i}: resolution {reso}, count: {len(imgs)}")
      img_ar_errors = np.array(img_ar_errors)
      print(f"mean ar error: {np.mean(np.abs(img_ar_errors))}")

    # build the reference index
    self.buckets_indices = []
    for bucket_index, bucket in enumerate(self.buckets):
      batch_count = int(math.ceil(len(bucket) / self.batch_size))
      for batch_index in range(batch_count):
        self.buckets_indices.append((bucket_index, batch_index))

    self.shuffle_buckets()
    self._length = len(self.buckets_indices)

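Each image is assigned to the bucket whose aspect ratio is closest to its own, exactly as in the argmin above; a standalone sketch with made-up numbers:

```
import numpy as np

# Hypothetical buckets (a subset of what make_bucket_resolutions produces for 512x512).
bucket_resos = [(448, 576), (512, 512), (576, 448)]
bucket_aspect_ratios = np.array([w / h for w, h in bucket_resos])

aspect_ratio = 1200 / 900  # a 1200x900 image, aspect ratio ~1.333
bucket_id = np.abs(bucket_aspect_ratios - aspect_ratio).argmin()
print(bucket_resos[bucket_id])  # (576, 448), the closest aspect ratio
```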
  # decide the target size, then trim down to the bucket resolution
  def resize_and_trim(self, image, reso):
    image_height, image_width = image.shape[0:2]
    ar_img = image_width / image_height
    ar_reso = reso[0] / reso[1]
    if ar_img > ar_reso:                  # the image is wider: match the height
      scale = reso[1] / image_height
    else:
      scale = reso[0] / image_width
    resized_size = (int(image_width * scale + .5), int(image_height * scale + .5))

    image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA)  # resize with cv2 because we want INTER_AREA
    if resized_size[0] > reso[0]:
      trim_size = resized_size[0] - reso[0]
      image = image[:, trim_size//2:trim_size//2 + reso[0]]
    elif resized_size[1] > reso[1]:
      trim_size = resized_size[1] - reso[1]
      image = image[trim_size//2:trim_size//2 + reso[1]]
    assert image.shape[0] == reso[1] and image.shape[1] == reso[0], \
        f"internal error, illegal trimmed size: {image.shape}, {reso}"
    return image

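A worked pass through the same arithmetic with made-up dimensions (mirrors resize_and_trim without needing an actual image):

```
# A 1000x600 image mapped to the (832, 512) bucket.
image_width, image_height = 1000, 600
reso = (832, 512)
ar_img, ar_reso = image_width / image_height, reso[0] / reso[1]   # ~1.667 vs 1.625
scale = reso[1] / image_height if ar_img > ar_reso else reso[0] / image_width
resized_size = (int(image_width * scale + .5), int(image_height * scale + .5))
print(resized_size)               # (853, 512): the height is matched first
print(resized_size[0] - reso[0])  # 21 extra columns, trimmed evenly from both sides
```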
  def shuffle_buckets(self):
    random.shuffle(self.buckets_indices)
    for bucket in self.buckets:
      random.shuffle(bucket)

  def load_image(self, image_path):
    image = Image.open(image_path)
    if not image.mode == "RGB":
@@ -184,83 +335,85 @@ class DreamBoothOrFineTuningDataset(torch.utils.data.Dataset):
  def __len__(self):
    return self._length

  def set_cached_latents(self, image_path, latents):
    if self.latents_cache is None:
      self.latents_cache = {}
    self.latents_cache[image_path] = latents

  def __getitem__(self, index):
    if index == 0:
      self.shuffle_buckets()

  def __getitem__(self, index_arg):
    example = {}
    bucket = self.buckets[self.buckets_indices[index][0]]
    image_index = self.buckets_indices[index][1] * self.batch_size

    if not self.enable_reg_images:
      index = index_arg
      img_path_captions = self.train_img_path_captions
      reg = False
    else:
      # even indices return train images, odd indices return reg images
      if index_arg % 2 == 0:
        img_path_captions = self.train_img_path_captions
        reg = False
    latents_list = []
    images = []
    captions = []
    loss_weights = []

    for is_reg, image_path, caption in bucket[image_index:image_index + self.batch_size]:
      loss_weights.append(1.0 if is_reg else self.prior_loss_weight)

      # process the image/latents
      reso, latents = self.size_lat_cache[image_path]

      if latents is None:
        # load the image and crop it if necessary
        img, face_cx, face_cy, face_w, face_h = self.load_image(image_path)
        im_h, im_w = img.shape[0:2]

        if self.enable_bucket:
          img = self.resize_and_trim(img, reso)
        else:
          if face_cx > 0:                 # face position info is available
            img = self.crop_target(img, face_cx, face_cy, face_w, face_h)
          elif im_h > self.height or im_w > self.width:
            assert self.random_crop, f"image too large, and face_crop_aug_range and random_crop are disabled / 画像サイズが大きいのでface_crop_aug_rangeかrandom_cropを有効にしてください"
            if im_h > self.height:
              p = random.randint(0, im_h - self.height)
              img = img[p:p + self.height]
            if im_w > self.width:
              p = random.randint(0, im_w - self.width)
              img = img[:, p:p + self.width]

          im_h, im_w = img.shape[0:2]
          assert im_h == self.height and im_w == self.width, f"image size is small / 画像サイズが小さいようです: {image_path}"

        # augmentation
        if self.aug is not None:
          img = self.aug(image=img)['image']

        image = self.image_transforms(img)   # becomes a torch.Tensor in the range -1.0 to 1.0
      else:
        img_path_captions = self.reg_img_path_captions
        reg = True
        index = index_arg // 2
        example['loss_weight'] = 1.0 if (not reg or self.fine_tuning) else self.prior_loss_weight
        image = None

      index = index % len(img_path_captions)
      image_path, caption = img_path_captions[index]
      example['image_path'] = image_path
      images.append(image)
      latents_list.append(latents)

      # process the image/latents
      if self.latents_cache is not None and image_path in self.latents_cache:
        # latents are already cached
        example['latents'] = self.latents_cache[image_path]
      else:
        # load the image and crop it if necessary
        img, face_cx, face_cy, face_w, face_h = self.load_image(image_path)
        im_h, im_w = img.shape[0:2]
        if face_cx > 0:                   # face position info is available
          img = self.crop_target(img, face_cx, face_cy, face_w, face_h)
        elif im_h > self.height or im_w > self.width:
          assert self.random_crop, f"image too large, and face_crop_aug_range and random_crop are disabled / 画像サイズが大きいのでface_crop_aug_rangeかrandom_cropを有効にしてください"
          if im_h > self.height:
            p = random.randint(0, im_h - self.height)
            img = img[p:p + self.height]
          if im_w > self.width:
            p = random.randint(0, im_w - self.width)
            img = img[:, p:p + self.width]
      # process the caption
      if self.fine_tuning and self.shuffle_caption:   # shuffle the caption during fine tuning
        tokens = caption.strip().split(",")
        random.shuffle(tokens)
        caption = ",".join(tokens).strip()
      captions.append(caption)

        im_h, im_w = img.shape[0:2]
        assert im_h == self.height and im_w == self.width, f"image too small / 画像サイズが小さいようです: {image_path}"

        # augmentation
        if self.aug is not None:
          img = self.aug(image=img)['image']

        example['image'] = self.image_transforms(img)   # becomes a torch.Tensor in the range -1.0 to 1.0

      # process the caption
      if self.fine_tuning and self.shuffle_caption:     # shuffle the caption during fine tuning
        tokens = caption.strip().split(",")
        random.shuffle(tokens)
        caption = ",".join(tokens).strip()

      input_ids = self.tokenizer(caption, padding="do_not_pad", truncation=True,
                                 max_length=self.tokenizer.model_max_length).input_ids

    # pad and convert to Tensor
    # pad the input_ids and convert to Tensor
    if self.disable_padding:
      # no padding: padding==True only pads to the longest item in the batch (isn't this a bug after all...?)
      input_ids = self.tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
      input_ids = self.tokenizer(captions, padding=True, truncation=True, return_tensors="pt").input_ids
    else:
      # pad to the maximum length
      input_ids = self.tokenizer.pad({"input_ids": input_ids}, padding='max_length', max_length=self.tokenizer.model_max_length,
                                     return_tensors='pt').input_ids

      input_ids = self.tokenizer(captions, padding='max_length', truncation=True, return_tensors='pt').input_ids

    example = {}
    example['loss_weights'] = torch.FloatTensor(loss_weights)
    example['input_ids'] = input_ids

    if images[0] is not None:
      images = torch.stack(images)
      images = images.to(memory_format=torch.contiguous_format).float()
    else:
      images = None
    example['images'] = images
    example['latents'] = torch.stack(latents_list) if latents_list[0] is not None else None
    if self.debug_dataset:
      example['caption'] = caption
      example['image_paths'] = [image_path for _, image_path, _ in bucket[image_index:image_index + self.batch_size]]
      example['captions'] = captions
    return example

@@ -916,7 +1069,7 @@ def load_models_from_stable_diffusion_checkpoint(ckpt_path):
  return text_model, vae, unet


def save_stable_diffusion_checkpoint(output_file, text_encoder, unet, ckpt_path, epochs, steps):
def save_stable_diffusion_checkpoint(output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None):
  # the VAE is not in memory, so load the checkpoint again including the VAE
  checkpoint = load_checkpoint_with_conversion(ckpt_path)
  state_dict = checkpoint["state_dict"]
@@ -926,6 +1079,8 @@ def save_stable_diffusion_checkpoint(output_file, text_encoder, unet, ckpt_path,
  for k, v in unet_state_dict.items():
    key = "model.diffusion_model." + k
    assert key in state_dict, f"Illegal key in save SD: {key}"
    if save_dtype is not None:
      v = v.detach().clone().to("cpu").to(save_dtype)
    state_dict[key] = v

  # Convert the text encoder model
@@ -933,6 +1088,8 @@ def save_stable_diffusion_checkpoint(output_file, text_encoder, unet, ckpt_path,
  for k, v in text_enc_dict.items():
    key = "cond_stage_model.transformer." + k
    assert key in state_dict, f"Illegal key in save SD: {key}"
    if save_dtype is not None:
      v = v.detach().clone().to("cpu").to(save_dtype)
    state_dict[key] = v

  # Put together new checkpoint
@@ -951,24 +1108,7 @@ def save_stable_diffusion_checkpoint(output_file, text_encoder, unet, ckpt_path,


def collate_fn(examples):
  input_ids = [e['input_ids'] for e in examples]
  input_ids = torch.stack(input_ids)

  if 'latents' in examples[0]:
    pixel_values = None
    latents = [e['latents'] for e in examples]
    latents = torch.stack(latents)
  else:
    pixel_values = [e['image'] for e in examples]
    pixel_values = torch.stack(pixel_values)
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
    latents = None

  loss_weights = [e['loss_weight'] for e in examples]
  loss_weights = torch.FloatTensor(loss_weights)

  batch = {"input_ids": input_ids, "pixel_values": pixel_values, "latents": latents, "loss_weights": loss_weights}
  return batch
  return examples[0]
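collate_fn can now simply return examples[0] because each dataset item is already a full batch drawn from one bucket, and the DataLoader is created with batch_size=1 further down. A standalone sketch of that pattern (illustrative names, not part of the commit):

```
import torch

class PreBatchedDataset(torch.utils.data.Dataset):
  # Each item is already a dict holding a whole batch, like the bucketing dataset above.
  def __init__(self, batches):
    self.batches = batches

  def __len__(self):
    return len(self.batches)

  def __getitem__(self, index):
    return self.batches[index]

batches = [{"input_ids": torch.zeros(4, 77, dtype=torch.long)} for _ in range(3)]
loader = torch.utils.data.DataLoader(PreBatchedDataset(batches), batch_size=1,
                                     shuffle=False, collate_fn=lambda examples: examples[0])
for batch in loader:
  print(batch["input_ids"].shape)  # torch.Size([4, 77]); the inner batch passes through unchanged
```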


def train(args):
@@ -998,19 +1138,22 @@ def train(args):
    try:
      n_repeats = int(tokens[0])
    except ValueError as e:
      print(f"no 'n_repeats' in directory name / DreamBoothのディレクトリ名に繰り返し回数がないようです: {dir}")
      raise e
      # print(f"no 'n_repeats' in directory name / DreamBoothのディレクトリ名に繰り返し回数がないようです: {dir}")
      # raise e
      return 0, []

    caption = '_'.join(tokens[1:])

    img_paths = glob.glob(os.path.join(dir, "*.png")) + glob.glob(os.path.join(dir, "*.jpg"))
    print(f"found directory {n_repeats}_{caption}")

    img_paths = glob.glob(os.path.join(dir, "*.png")) + glob.glob(os.path.join(dir, "*.jpg")) + glob.glob(os.path.join(dir, "*.webp"))
    return n_repeats, [(ip, caption) for ip in img_paths]

  print("prepare train images.")
  train_img_path_captions = []

  if fine_tuning:
    img_paths = glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.jpg"))
    img_paths = glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
    for img_path in tqdm(img_paths):
      # build the candidate caption file names
      base_name = os.path.splitext(img_path)[0]
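The DreamBooth directory name encodes the repeat count before the first underscore; a small illustrative sketch of the parsing above (it assumes tokens comes from splitting the folder name on "_", which is not shown in this hunk):

```
dir_name = "20_sks_dog"            # hypothetical folder name
tokens = dir_name.split("_")
n_repeats = int(tokens[0])         # 20
caption = '_'.join(tokens[1:])     # "sks_dog"
print(n_repeats, caption)
```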
@@ -1042,7 +1185,7 @@ def train(args):
      n_repeats, img_caps = load_dreambooth_dir(os.path.join(args.train_data_dir, dir))
      for _ in range(n_repeats):
        train_img_path_captions.extend(img_caps)
  print(f"{len(train_img_path_captions)} train images.")
  print(f"{len(train_img_path_captions)} train images with repeating.")

  reg_img_path_captions = []
  if args.reg_data_dir:
@@ -1054,11 +1197,6 @@ def train(args):
        reg_img_path_captions.extend(img_caps)
    print(f"{len(reg_img_path_captions)} reg images.")

  if args.debug_dataset:
    # when debugging, shuffle here to mimic real dataset use (during training the data loader shuffles)
    random.shuffle(train_img_path_captions)
    random.shuffle(reg_img_path_captions)

  # prepare the dataset
  resolution = tuple([int(r) for r in args.resolution.split(',')])
  if len(resolution) == 1:
@@ -1078,21 +1216,25 @@ def train(args):
  tokenizer = CLIPTokenizer.from_pretrained(TOKENIZER_PATH)

  print("prepare dataset")
  train_dataset = DreamBoothOrFineTuningDataset(fine_tuning, train_img_path_captions,
                                                reg_img_path_captions, tokenizer, resolution, args.prior_loss_weight, args.flip_aug, args.color_aug, face_crop_aug_range, args.random_crop, args.shuffle_caption, args.no_token_padding, args.debug_dataset)
  train_dataset = DreamBoothOrFineTuningDataset(args.train_batch_size, fine_tuning, train_img_path_captions, reg_img_path_captions, tokenizer, resolution,
                                                args.prior_loss_weight, args.flip_aug, args.color_aug, face_crop_aug_range, args.random_crop,
                                                args.shuffle_caption, args.no_token_padding, args.debug_dataset)

  if args.debug_dataset:
    print(f"Total dataset length / データセットの長さ: {len(train_dataset)}")
    train_dataset.make_buckets_with_caching(args.enable_bucket, None)  # build without caching for the debug display
    print(f"Total dataset length (steps) / データセットの長さ(ステップ数): {len(train_dataset)}")
    print("Escape for exit. / Escキーで中断、終了します")
    for example in train_dataset:
      im = example['image']
      im = ((im.numpy() + 1.0) * 127.5).astype(np.uint8)
      im = np.transpose(im, (1, 2, 0))  # c,H,W -> H,W,c
      im = im[:, :, ::-1]  # RGB -> BGR (OpenCV)
      print(f'caption: "{example["caption"]}", loss weight: {example["loss_weight"]}')
      cv2.imshow("img", im)
      k = cv2.waitKey()
      cv2.destroyAllWindows()
      for im, cap, lw in zip(example['images'], example['captions'], example['loss_weights']):
        im = ((im.numpy() + 1.0) * 127.5).astype(np.uint8)
        im = np.transpose(im, (1, 2, 0))  # c,H,W -> H,W,c
        im = im[:, :, ::-1]  # RGB -> BGR (OpenCV)
        print(f'size: {im.shape[1]}*{im.shape[0]}, caption: "{cap}", loss weight: {lw}')
        cv2.imshow("img", im)
        k = cv2.waitKey()
        cv2.destroyAllWindows()
        if k == 27:
          break
      if k == 27:
        break
    return
@@ -1100,7 +1242,14 @@ def train(args):
  # prepare the accelerator
  # gradient accumulation reportedly does not support training multiple models, so fix it at 1
  print("prepare accelerator")
  accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision=args.mixed_precision)
  if args.logging_dir is None:
    log_with = None
    logging_dir = None
  else:
    log_with = "tensorboard"
    logging_dir = args.logging_dir + "/" + time.strftime('%Y%m%d%H%M%S', time.localtime())
  accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision=args.mixed_precision,
                            log_with=log_with, logging_dir=logging_dir)

  # load the models
  if use_stable_diffusion_format:
@@ -1122,28 +1271,24 @@ def train(args):
  elif args.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16

  save_dtype = None
  if args.save_precision == "fp16":
    save_dtype = torch.float16
  elif args.save_precision == "bf16":
    save_dtype = torch.bfloat16
  elif args.save_precision == "float":
    save_dtype = torch.float32

  # prepare for training
  if cache_latents:
    # cache the latents: creating a new Dataset would break caption shuffling, so keep the cache in the original Dataset (cascading is another option)
    print("caching latents.")
    vae.to(accelerator.device, dtype=weight_dtype)

    for i in tqdm(range(len(train_dataset))):
      example = train_dataset[i]
      if 'latents' not in example:
        image_path = example['image_path']
        with torch.no_grad():
          pixel_values = example["image"].unsqueeze(0).to(device=accelerator.device, dtype=weight_dtype)
          latents = vae.encode(pixel_values).latent_dist.sample().squeeze(0).to("cpu")
          train_dataset.set_cached_latents(image_path, latents)
    # assertion
    for i in range(len(train_dataset)):
      assert 'latents' in train_dataset[i], "internal error: latents not cached"

    with torch.no_grad():
      train_dataset.make_buckets_with_caching(args.enable_bucket, vae)
    del vae
    if torch.cuda.is_available():
      torch.cuda.empty_cache()
  else:
    train_dataset.make_buckets_with_caching(args.enable_bucket, None)
  vae.requires_grad_(False)

  if args.gradient_checkpointing:
@@ -1173,7 +1318,7 @@ def train(args):
  # number of DataLoader processes: 0 means the main process
  n_workers = min(8, os.cpu_count() - 1)  # cpu_count - 1, but at most 8
  train_dataloader = torch.utils.data.DataLoader(
      train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=n_workers)
      train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers)

  # prepare the lr scheduler
  lr_scheduler = diffusers.optimization.get_scheduler("constant", optimizer, num_training_steps=args.max_train_steps)
@@ -1185,6 +1330,11 @@ def train(args):
  if not cache_latents:
    vae.to(accelerator.device, dtype=weight_dtype)

  # resume if requested
  if args.resume is not None:
    print(f"resume training from state: {args.resume}")
    accelerator.load_state(args.resume)

  # calculate the number of epochs
  num_train_epochs = math.ceil(args.max_train_steps / len(train_dataloader))

@@ -1193,7 +1343,7 @@ def train(args):
  print("running training / 学習開始")
  print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset.num_train_images}")
  print(f" num reg images / 正則化画像の数: {train_dataset.num_reg_images}")
  print(f" num examples / サンプル数: {len(train_dataset)}")
  print(f" num examples / サンプル数: {train_dataset.num_train_images * 2}")
  print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
  print(f" num epochs / epoch数: {num_train_epochs}")
  print(f" batch size per device / バッチサイズ: {args.train_batch_size}")
@@ -1222,7 +1372,7 @@ def train(args):
        if cache_latents:
          latents = batch["latents"].to(accelerator.device)
        else:
          latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
          latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample()
        latents = latents * 0.18215

        # Sample noise that we'll add to the latents
@@ -1271,15 +1421,22 @@ def train(args):
        global_step += 1

      current_loss = loss.detach().item()
      if args.logging_dir is not None:
        logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]}
        accelerator.log(logs, step=global_step)

      loss_total += current_loss
      avr_loss = loss_total / (step+1)
      logs = {"loss": avr_loss}  # , "lr": lr_scheduler.get_last_lr()[0]}
      progress_bar.set_postfix(**logs)
      # accelerator.log(logs, step=global_step)

      if global_step >= args.max_train_steps:
        break

    if args.logging_dir is not None:
      logs = {"epoch_loss": loss_total / len(train_dataloader)}
      accelerator.log(logs, step=epoch+1)

    accelerator.wait_for_everyone()

    if use_stable_diffusion_format and args.save_every_n_epochs is not None:
@@ -1288,7 +1445,11 @@ def train(args):
        os.makedirs(args.output_dir, exist_ok=True)
        ckpt_file = os.path.join(args.output_dir, EPOCH_CHECKPOINT_NAME.format(epoch + 1))
        save_stable_diffusion_checkpoint(ckpt_file, accelerator.unwrap_model(text_encoder), accelerator.unwrap_model(unet),
                                         args.pretrained_model_name_or_path, epoch + 1, global_step)
                                         args.pretrained_model_name_or_path, epoch + 1, global_step, save_dtype)

        if args.save_state:
          print("saving state.")
          accelerator.save_state(os.path.join(args.output_dir, EPOCH_STATE_NAME.format(epoch + 1)))

  is_main_process = accelerator.is_main_process
  if is_main_process:
@@ -1296,6 +1457,11 @@ def train(args):
    text_encoder = accelerator.unwrap_model(text_encoder)

  accelerator.end_training()

  if args.save_state:
    print("saving last state.")
    accelerator.save_state(os.path.join(args.output_dir, LAST_STATE_NAME))

  del accelerator  # delete this because memory is needed afterwards

  if is_main_process:
@@ -1303,7 +1469,8 @@ def train(args):
    if use_stable_diffusion_format:
      ckpt_file = os.path.join(args.output_dir, LAST_CHECKPOINT_NAME)
      print(f"save trained model as StableDiffusion checkpoint to {ckpt_file}")
      save_stable_diffusion_checkpoint(ckpt_file, text_encoder, unet, args.pretrained_model_name_or_path, epoch, global_step)
      save_stable_diffusion_checkpoint(ckpt_file, text_encoder, unet,
                                       args.pretrained_model_name_or_path, epoch, global_step, save_dtype)
    else:
      # Create the pipeline using the trained modules and save it.
      print(f"save trained model as Diffusers to {args.output_dir}")
@@ -1589,6 +1756,10 @@ if __name__ == '__main__':
                      help="directory to output trained model, save as same format as input / 学習後のモデル出力先ディレクトリ(入力と同じ形式で保存)")
  parser.add_argument("--save_every_n_epochs", type=int, default=None,
                      help="save checkpoint every N epochs (only supports in StableDiffusion checkpoint) / 学習中のモデルを指定エポックごとに保存します(StableDiffusion形式のモデルを読み込んだ場合のみ有効)")
  parser.add_argument("--save_state", action="store_true",
                      help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する")
  parser.add_argument("--resume", type=str, default=None,
                      help="saved state to resume training / 学習再開するモデルのstate")
  parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="loss weight for regularization images / 正則化画像のlossの重み")
  parser.add_argument("--no_token_padding", action="store_true",
                      help="disable token padding (same as Diffuser's DreamBooth) / トークンのpaddingを無効にする(Diffusers版DreamBoothと同じ動作)")
@@ -1612,6 +1783,8 @@ if __name__ == '__main__':
                      help="use xformers for CrossAttention / CrossAttentionにxformersを使う")
  parser.add_argument("--cache_latents", action="store_true",
                      help="cache latents to reduce memory (augmentations must be disabled) / メモリ削減のためにlatentをcacheする(augmentationは使用不可)")
  parser.add_argument("--enable_bucket", action="store_true",
                      help="enable buckets for multi aspect ratio training / 複数解像度学習のためのbucketを有効にする")
  parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率")
  parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数")
  parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed")
@@ -1619,8 +1792,12 @@ if __name__ == '__main__':
                      help="enable gradient checkpointing / grandient checkpointingを有効にする")
  parser.add_argument("--mixed_precision", type=str, default="no",
                      choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度")
  parser.add_argument("--save_precision", type=str, default=None,
                      choices=[None, "float", "fp16", "bf16"], help="precision in saving / 保存時に精度を変更して保存する")
  parser.add_argument("--clip_skip", type=int, default=None,
                      help="use output of nth layer from back of text encoder (n>=1) / text encoderの後ろからn番目の層の出力を用いる(nは1以上)")
  parser.add_argument("--logging_dir", type=str, default=None,
                      help="enable logging and output TensorBoard log to this directory / ログ出力を有効にしてこのディレクトリにTensorBoard用のログを出力する")

  args = parser.parse_args()
  train(args)