Merge pull request #59 from bmaltais/dev

v20.3.0 release
This commit is contained in:
bmaltais 2023-01-19 15:48:18 -05:00 committed by GitHub
commit 29ec8658a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 104 additions and 15 deletions

View File

@ -6,7 +6,7 @@ This repository repository is providing a Gradio GUI for kohya's Stable Diffusio
Python 3.10.6+ and Git:
- Python 3.10.6+: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe
- Python 3.10.6+: https://www.python.org/ftp/python/3.10.9/python-3.10.9-amd64.exe
- git: https://git-scm.com/download/win
## Installation
@ -27,7 +27,7 @@ python -m venv --system-site-packages venv
.\venv\Scripts\activate
pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
pip install --upgrade -r requirements.txt
pip install --use-pep517 --upgrade -r requirements.txt
pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl
cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
@ -61,7 +61,7 @@ When a new release comes out you can upgrade your repo with the following comman
cd kohya_ss
git pull
.\venv\Scripts\activate
pip install --upgrade -r requirements.txt
pip install --use-pep517 --upgrade -r requirements.txt
```
Once the commands have completed successfully you should be ready to use the new version.
@ -104,8 +104,48 @@ python lora_gui.py
Once you have created the LoRA network you can generate images via auto1111 by installing the extension found here: https://github.com/kohya-ss/sd-webui-additional-networks
## Troubleshooting
### Page file limit
- if get X error relating to `page file`, increase page file size limit in Windows
### No module called tkinter
- Re-install python 3.10.x on your system: https://www.python.org/ftp/python/3.10.9/python-3.10.9-amd64.exe
## Change history
* 2023/01/16 (v20.3.0)
- Fix a part of LoRA modules are not trained when ``gradient_checkpointing`` is enabled.
- Add ``--save_last_n_epochs_state`` option. You can specify how many state folders to keep, apart from how many models to keep. Thanks to shirayu!
- Fix Text Encoder training stops at ``max_train_steps`` even if ``max_train_epochs`` is set in `train_db.py``.
- Added script to check LoRA weights. You can check weights by ``python networks\check_lora_weights.py <model file>``. If some modules are not trained, the value is ``0.0`` like following.
- ``lora_te_text_model_encoder_layers_11_*`` is not trained with ``clip_skip=2``, so ``0.0`` is okay for these modules.
- example result of ``check_lora_weights.py``, Text Encoder and a part of U-Net are not trained:
```
number of LoRA-up modules: 264
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight,0.0
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight,0.0
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight,0.0
:
lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight,0.0
lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight,0.0
lora_unet_mid_block_attentions_0_proj_in.lora_up.weight,0.003503334941342473
lora_unet_mid_block_attentions_0_proj_out.lora_up.weight,0.004308608360588551
:
```
- all modules are trained:
```
number of LoRA-up modules: 264
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight,0.0028684409335255623
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight,0.0029794853180646896
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight,0.002507600700482726
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight,0.002639499492943287
:
```
* 2023/01/16 (v20.2.1):
- Merging latest code update from kohya
- Added `--max_train_epochs` and `--max_data_loader_n_workers` option for each training script.

10
gui.bat Normal file
View File

@ -0,0 +1,10 @@
@echo off
set VENV_DIR=.\venv
set PYTHON=python
call %VENV_DIR%\Scripts\activate.bat
%PYTHON% kohya_gui.py
pause

View File

@ -1029,6 +1029,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
parser.add_argument("--save_every_n_epochs", type=int, default=None,
help="save checkpoint every N epochs / 学習中のモデルを指定エポックごとに保存する")
parser.add_argument("--save_last_n_epochs", type=int, default=None, help="save last N checkpoints / 最大Nエポック保存する")
parser.add_argument("--save_last_n_epochs_state", type=int, default=None, help="save last N checkpoints of state (overrides the value of --save_last_n_epochs)/ 最大Nエポックstateを保存する(--save_last_n_epochsの指定を上書きします)")
parser.add_argument("--save_state", action="store_true",
help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する")
parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate")
@ -1298,7 +1299,6 @@ def get_epoch_ckpt_name(args: argparse.Namespace, use_safetensors, epoch):
def save_on_epoch_end(args: argparse.Namespace, save_func, remove_old_func, epoch_no: int, num_train_epochs: int):
saving = epoch_no % args.save_every_n_epochs == 0 and epoch_no < num_train_epochs
remove_epoch_no = None
if saving:
os.makedirs(args.output_dir, exist_ok=True)
save_func()
@ -1306,7 +1306,7 @@ def save_on_epoch_end(args: argparse.Namespace, save_func, remove_old_func, epoc
if args.save_last_n_epochs is not None:
remove_epoch_no = epoch_no - args.save_every_n_epochs * args.save_last_n_epochs
remove_old_func(remove_epoch_no)
return saving, remove_epoch_no
return saving
def save_sd_model_on_epoch_end(args: argparse.Namespace, accelerator, src_path: str, save_stable_diffusion_format: bool, use_safetensors: bool, save_dtype: torch.dtype, epoch: int, num_train_epochs: int, global_step: int, text_encoder, unet, vae):
@ -1346,15 +1346,18 @@ def save_sd_model_on_epoch_end(args: argparse.Namespace, accelerator, src_path:
save_func = save_du
remove_old_func = remove_du
saving, remove_epoch_no = save_on_epoch_end(args, save_func, remove_old_func, epoch_no, num_train_epochs)
saving = save_on_epoch_end(args, save_func, remove_old_func, epoch_no, num_train_epochs)
if saving and args.save_state:
save_state_on_epoch_end(args, accelerator, model_name, epoch_no, remove_epoch_no)
save_state_on_epoch_end(args, accelerator, model_name, epoch_no)
def save_state_on_epoch_end(args: argparse.Namespace, accelerator, model_name, epoch_no, remove_epoch_no):
def save_state_on_epoch_end(args: argparse.Namespace, accelerator, model_name, epoch_no):
print("saving state.")
accelerator.save_state(os.path.join(args.output_dir, EPOCH_STATE_NAME.format(model_name, epoch_no)))
if remove_epoch_no is not None:
last_n_epochs = args.save_last_n_epochs_state if args.save_last_n_epochs_state else args.save_last_n_epochs
if last_n_epochs is not None:
remove_epoch_no = epoch_no - args.save_every_n_epochs * last_n_epochs
state_dir_old = os.path.join(args.output_dir, EPOCH_STATE_NAME.format(model_name, remove_epoch_no))
if os.path.exists(state_dir_old):
print(f"removing old state: {state_dir_old}")

View File

@ -0,0 +1,31 @@
import argparse
import os
import torch
from safetensors.torch import load_file
def main(file):
print(f"loading: {file}")
if os.path.splitext(file)[1] == '.safetensors':
sd = load_file(file)
else:
sd = torch.load(file, map_location='cpu')
values = []
keys = list(sd.keys())
for key in keys:
if 'lora_up' in key:
values.append((key, sd[key]))
print(f"number of LoRA-up modules: {len(values)}")
for key, value in values:
print(f"{key},{torch.mean(torch.abs(value))}")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("file", type=str, help="model file to check / 重みを確認するモデルファイル")
args = parser.parse_args()
main(args.file)

View File

@ -12,6 +12,8 @@ safetensors==0.2.6
gradio==3.15.0
altair
easygui
tk
tkinter
# for BLIP captioning
requests
timm

View File

@ -92,10 +92,7 @@ def train(args):
gc.collect()
# 学習を準備する:モデルを適切な状態にする
if args.stop_text_encoder_training is None:
args.stop_text_encoder_training = args.max_train_steps + 1 # do not stop until end
train_text_encoder = args.stop_text_encoder_training >= 0
train_text_encoder = args.stop_text_encoder_training is None or args.stop_text_encoder_training >= 0
unet.requires_grad_(True) # 念のため追加
text_encoder.requires_grad_(train_text_encoder)
if not train_text_encoder:
@ -143,6 +140,9 @@ def train(args):
args.max_train_steps = args.max_train_epochs * len(train_dataloader)
print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
if args.stop_text_encoder_training is None:
args.stop_text_encoder_training = args.max_train_steps + 1 # do not stop until end
# lr schedulerを用意する
lr_scheduler = diffusers.optimization.get_scheduler(
args.lr_scheduler, optimizer, num_warmup_steps=args.lr_warmup_steps, num_training_steps=args.max_train_steps)

View File

@ -166,6 +166,9 @@ def train(args):
if args.gradient_checkpointing: # according to TI example in Diffusers, train is required
unet.train()
text_encoder.train()
# set top parameter requires_grad = True for gradient checkpointing works
text_encoder.text_model.embeddings.requires_grad_(True)
else:
unet.eval()
text_encoder.eval()
@ -364,9 +367,9 @@ def train(args):
print(f"removing old checkpoint: {old_ckpt_file}")
os.remove(old_ckpt_file)
saving, remove_epoch_no = train_util.save_on_epoch_end(args, save_func, remove_old_func, epoch + 1, num_train_epochs)
saving = train_util.save_on_epoch_end(args, save_func, remove_old_func, epoch + 1, num_train_epochs)
if saving and args.save_state:
train_util.save_state_on_epoch_end(args, accelerator, model_name, epoch + 1, remove_epoch_no)
train_util.save_state_on_epoch_end(args, accelerator, model_name, epoch + 1)
# end of epoch