diff --git a/BLIP_caption/configs/med_config.json b/BLIP_caption/configs/med_config.json deleted file mode 100644 index 0ffad0a..0000000 --- a/BLIP_caption/configs/med_config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "architectures": [ - "BertModel" - ], - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 512, - "model_type": "bert", - "num_attention_heads": 12, - "num_hidden_layers": 12, - "pad_token_id": 0, - "type_vocab_size": 2, - "vocab_size": 30524, - "encoder_width": 768, - "add_cross_attention": true -} diff --git a/README.md b/README.md index 5b946fc..a1d6482 100644 --- a/README.md +++ b/README.md @@ -1,194 +1,13 @@ -# HOWTO +# Kohya's dreambooth and finetuning -This repo provide all the required config to run the Dreambooth version found in this note: https://note.com/kohya_ss/n/nee3ed1649fb6 -The setup of bitsandbytes with Adam8bit support for windows: https://note.com/kohya_ss/n/n47f654dc161e +This repo now combine bot Kohya_ss solution under one roof. I am merging both under a single repo to align with the new official kohya repo where he will maintain his code from now on: https://github.com/kohya-ss/sd-scripts -## Required Dependencies +A new note accompaning the release of his new repo can be found here: https://note.com/kohya_ss/n/nba4eceaa4594 -Python 3.10.6 and Git: +## Dreambooth -- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe -- git: https://git-scm.com/download/win +You can find the dreambooth solution spercific [Dreambooth README](README_dreambooth.md) -Give unrestricted script access to powershell so venv can work: +## Finetune -- Open an administrator powershell window -- Type `Set-ExecutionPolicy Unrestricted` and answer A -- Close admin powershell window - -## Installation - -Open a regular Powershell terminal and type the following inside: - -```powershell -git clone https://github.com/bmaltais/kohya_ss.git -cd kohya_ss - -python -m venv --system-site-packages venv -.\venv\Scripts\activate - -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 -pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl - -cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py - -accelerate config - -``` - -Answers to accelerate config: - -```txt -- 0 -- 0 -- NO -- NO -- All -- fp16 -``` - -### Optional: CUDNN 8.6 - -This step is optional but can improve the learning speed for NVidia 4090 owners... - -Due to the filesize I can't host the DLLs needed for CUDNN 8.6 on Github, I strongly advise you download them for a speed boost in sample generation (almost 50% on 4090) you can download them from here: https://b1.thefileditch.ch/mwxKTEtelILoIbMbruuM.zip - -To install simply unzip the directory and place the cudnn_windows folder in the root of the kohya_diffusers_fine_tuning repo. 
- -Run the following command to install: - -``` -python cudann_1.8_install.py -``` - -## Upgrade - -When a new release comes out you can upgrade your repo with the following command: - -```powershell -cd kohya_ss -git pull -.\venv\Scripts\activate -pip install --upgrade -r requirements.txt -``` - -Once the commands have completed successfully you should be ready to use the new version. - -## GUI - -There is now support for GUI based training using gradio. You can start the GUI interface by running: - -```powershell -python .\dreambooth_gui.py -``` - -## Quickstart screencast - -You can find a screen cast on how to use the GUI at the following location: - -[![Video](https://img.youtube.com/vi/RlvqEKj03WI/maxresdefault.jpg)](https://www.youtube.com/watch?v=RlvqEKj03WI) - -## Folders configuration - -Refer to the note to understand how to create the folde structure. In short it should look like: - -``` - -|- - |- _ -|- - |- _ -``` - -Example for `asd dog` where `asd` is the token word and `dog` is the class. In this example the regularization `dog` class images contained in the folder will be repeated only 1 time and the `asd dog` images will be repeated 20 times: - -``` -my_asd_dog_dreambooth -|- reg_dog - |- 1_dog - `- reg_image_1.png - `- reg_image_2.png - ... - `- reg_image_256.png -|- train_dog - |- 20_asd dog - `- dog1.png - ... - `- dog8.png -``` - -## Support - -Drop by the discord server for support: https://discord.com/channels/1041518562487058594/1041518563242020906 - -## Contributors - -- Lord of the universe - cacoe (twitter: @cac0e) - -## Change history - -* 12/19 (v18.4) update: - - Add support for shuffle_caption, save_state, resume, prior_loss_weight under "Advanced Configuration" section - - Fix issue with open/save config not working properly -* 12/19 (v18.3) update: - - fix stop encoder training issue -* 12/19 (v18.2) update: - - Fix file/folder opening behind the browser window - - Add WD14 and BLIP captioning to utilities - - Improve overall GUI layout -* 12/18 (v18.1) update: - - Add Stable Diffusion model conversion utility. Make sure to run `pip upgrade -U -r requirements.txt` after updating to this release as this introduce new pip requirements. -* 12/17 (v18) update: - - Save model as option added to train_db_fixed.py - - Save model as option added to GUI - - Retire "Model conversion" parameters that was essentially performing the same function as the new `--save_model_as` parameter -* 12/17 (v17.2) update: - - Adding new dataset balancing utility. -* 12/17 (v17.1) update: - - Adding GUI for kohya_ss called dreambooth_gui.py - - removing support for `--finetuning` as there is now a dedicated python repo for that. `--fine-tuning` is still there behind the scene until kohya_ss remove it in a future code release. - - removing cli examples as I will now focus on the GUI for training. People who prefer cli based training can still do that. -* 12/13 (v17) update: - - Added support for learning to fp16 gradient (experimental function). SD1.x can be trained with 8GB of VRAM. Specify full_fp16 options. -* 12/06 (v16) update: - - Added support for Diffusers 0.10.2 (use code in Diffusers to learn v-parameterization). - - Diffusers also supports safetensors. - - Added support for accelerate 0.15.0. -* 12/05 (v15) update: - - The script has been divided into two parts - - Support for SafeTensors format has been added. Install SafeTensors with `pip install safetensors`. The script will automatically detect the format based on the file extension when loading. 
Use the `--use_safetensors` option if you want to save the model as safetensor. - - The vae option has been added to load a VAE model separately. - - The log_prefix option has been added to allow adding a custom string to the log directory name before the date and time. -* 11/30 (v13) update: - - fix training text encoder at specified step (`--stop_text_encoder_training=`) that was causing both Unet and text encoder training to stop completely at the specified step rather than continue without text encoding training. -* 11/29 (v12) update: - - stop training text encoder at specified step (`--stop_text_encoder_training=`) - - tqdm smoothing - - updated fine tuning script to support SD2.0 768/v -* 11/27 (v11) update: - - DiffUsers 0.9.0 is required. Update with `pip install --upgrade -r requirements.txt` in the virtual environment. - - The way captions are handled in DreamBooth has changed. When a caption file existed, the file's caption was added to the folder caption until v10, but from v11 it is only the file's caption. Please be careful. - - Fixed a bug where prior_loss_weight was applied to learning images. Sorry for the inconvenience. - - Compatible with Stable Diffusion v2.0. Add the `--v2` option. If you are using `768-v-ema.ckpt` or `stable-diffusion-2` instead of `stable-diffusion-v2-base`, add `--v_parameterization` as well. Learn more about other options. - - Added options related to the learning rate scheduler. - - You can download and use DiffUsers models directly from Hugging Face. In addition, DiffUsers models can be saved during training. -* 11/21 (v10): - - Added minimum/maximum resolution specification when using Aspect Ratio Bucketing (min_bucket_reso/max_bucket_reso option). - - Added extension specification for caption files (caption_extention). - - Added support for images with .webp extension. - - Added a function that allows captions to learning images and regularized images. -* 11/18 (v9): - - Added support for Aspect Ratio Bucketing (enable_bucket option). (--enable_bucket) - - Added support for selecting data format (fp16/bf16/float) when saving checkpoint (--save_precision) - - Added support for saving learning state (--save_state, --resume) - - Added support for logging (--logging_dir) -* 11/14 (diffusers_fine_tuning v2): - - script name is now fine_tune.py. - - Added option to learn Text Encoder --train_text_encoder. - - The data format of checkpoint at the time of saving can be specified with the --save_precision option. You can choose float, fp16, and bf16. - - Added a --save_state option to save the learning state (optimizer, etc.) in the middle. It can be resumed with the --resume option. -* 11/9 (v8): supports Diffusers 0.7.2. To upgrade diffusers run `pip install --upgrade diffusers[torch]` -* 11/7 (v7): Text Encoder supports checkpoint files in different storage formats (it is converted at the time of import, so export will be in normal format). Changed the average value of EPOCH loss to output to the screen. Added a function to save epoch and global step in checkpoint in SD format (add values if there is existing data). The reg_data_dir option is enabled during fine tuning (fine tuning while mixing regularized images). Added dataset_repeats option that is valid for fine tuning (specified when the number of teacher images is small and the epoch is extremely short). 
\ No newline at end of file +You can find the finetune solution spercific [Finetune README](README_finetune.md) \ No newline at end of file diff --git a/README_dreambooth.md b/README_dreambooth.md new file mode 100644 index 0000000..62e1738 --- /dev/null +++ b/README_dreambooth.md @@ -0,0 +1,203 @@ +# Kohya_ss Dreambooth + +This repo provide all the required code to run the Dreambooth version found in this note: https://note.com/kohya_ss/n/nee3ed1649fb6 + +## Required Dependencies + +Python 3.10.6 and Git: + +- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe +- git: https://git-scm.com/download/win + +Give unrestricted script access to powershell so venv can work: + +- Open an administrator powershell window +- Type `Set-ExecutionPolicy Unrestricted` and answer A +- Close admin powershell window + +## Installation + +Open a regular Powershell terminal and type the following inside: + +```powershell +git clone https://github.com/bmaltais/kohya_ss.git +cd kohya_ss + +python -m venv --system-site-packages venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config + +``` + +Answers to accelerate config: + +```txt +- 0 +- 0 +- NO +- NO +- All +- fp16 +``` + +### Optional: CUDNN 8.6 + +This step is optional but can improve the learning speed for NVidia 4090 owners... + +Due to the filesize I can't host the DLLs needed for CUDNN 8.6 on Github, I strongly advise you download them for a speed boost in sample generation (almost 50% on 4090) you can download them from here: https://b1.thefileditch.ch/mwxKTEtelILoIbMbruuM.zip + +To install simply unzip the directory and place the cudnn_windows folder in the root of the kohya_diffusers_fine_tuning repo. + +Run the following command to install: + +``` +python .\tools\cudann_1.8_install.py +``` + +## Upgrade + +When a new release comes out you can upgrade your repo with the following command: + +``` +.\upgrade.bat +``` + +or you can do it manually with + +```powershell +cd kohya_ss +git pull +.\venv\Scripts\activate +pip install --upgrade -r requirements.txt +``` + +Once the commands have completed successfully you should be ready to use the new version. + +## GUI + +There is now support for GUI based training using gradio. You can start the GUI interface by running: + +```powershell +.\dreambooth.bat +``` + +## CLI + +You can find various examples of how to leverage the fine_tune.py in this folder: https://github.com/bmaltais/kohya_ss/tree/master/examples + +## Quickstart screencast + +You can find a screen cast on how to use the GUI at the following location: + +[![Video](https://img.youtube.com/vi/RlvqEKj03WI/maxresdefault.jpg)](https://www.youtube.com/watch?v=RlvqEKj03WI) + +## Folders configuration + +Refer to the note to understand how to create the folde structure. In short it should look like: + +``` + +|- + |- _ +|- + |- _ +``` + +Example for `asd dog` where `asd` is the token word and `dog` is the class. 
In this example the regularization `dog` class images contained in the folder will be repeated only 1 time and the `asd dog` images will be repeated 20 times: + +``` +my_asd_dog_dreambooth +|- reg_dog + |- 1_dog + `- reg_image_1.png + `- reg_image_2.png + ... + `- reg_image_256.png +|- train_dog + |- 20_asd dog + `- dog1.png + ... + `- dog8.png +``` + +## Support + +Drop by the discord server for support: https://discord.com/channels/1041518562487058594/1041518563242020906 + +## Contributors + +- Lord of the universe - cacoe (twitter: @cac0e) + +## Change history + +* 12/19 (v18.4) update: + - Add support for shuffle_caption, save_state, resume, prior_loss_weight under "Advanced Configuration" section + - Fix issue with open/save config not working properly +* 12/19 (v18.3) update: + - fix stop encoder training issue +* 12/19 (v18.2) update: + - Fix file/folder opening behind the browser window + - Add WD14 and BLIP captioning to utilities + - Improve overall GUI layout +* 12/18 (v18.1) update: + - Add Stable Diffusion model conversion utility. Make sure to run `pip upgrade -U -r requirements.txt` after updating to this release as this introduce new pip requirements. +* 12/17 (v18) update: + - Save model as option added to train_db_fixed.py + - Save model as option added to GUI + - Retire "Model conversion" parameters that was essentially performing the same function as the new `--save_model_as` parameter +* 12/17 (v17.2) update: + - Adding new dataset balancing utility. +* 12/17 (v17.1) update: + - Adding GUI for kohya_ss called dreambooth_gui.py + - removing support for `--finetuning` as there is now a dedicated python repo for that. `--fine-tuning` is still there behind the scene until kohya_ss remove it in a future code release. + - removing cli examples as I will now focus on the GUI for training. People who prefer cli based training can still do that. +* 12/13 (v17) update: + - Added support for learning to fp16 gradient (experimental function). SD1.x can be trained with 8GB of VRAM. Specify full_fp16 options. +* 12/06 (v16) update: + - Added support for Diffusers 0.10.2 (use code in Diffusers to learn v-parameterization). + - Diffusers also supports safetensors. + - Added support for accelerate 0.15.0. +* 12/05 (v15) update: + - The script has been divided into two parts + - Support for SafeTensors format has been added. Install SafeTensors with `pip install safetensors`. The script will automatically detect the format based on the file extension when loading. Use the `--use_safetensors` option if you want to save the model as safetensor. + - The vae option has been added to load a VAE model separately. + - The log_prefix option has been added to allow adding a custom string to the log directory name before the date and time. +* 11/30 (v13) update: + - fix training text encoder at specified step (`--stop_text_encoder_training=`) that was causing both Unet and text encoder training to stop completely at the specified step rather than continue without text encoding training. +* 11/29 (v12) update: + - stop training text encoder at specified step (`--stop_text_encoder_training=`) + - tqdm smoothing + - updated fine tuning script to support SD2.0 768/v +* 11/27 (v11) update: + - DiffUsers 0.9.0 is required. Update with `pip install --upgrade -r requirements.txt` in the virtual environment. + - The way captions are handled in DreamBooth has changed. When a caption file existed, the file's caption was added to the folder caption until v10, but from v11 it is only the file's caption. 
Please be careful. + - Fixed a bug where prior_loss_weight was applied to learning images. Sorry for the inconvenience. + - Compatible with Stable Diffusion v2.0. Add the `--v2` option. If you are using `768-v-ema.ckpt` or `stable-diffusion-2` instead of `stable-diffusion-v2-base`, add `--v_parameterization` as well. Learn more about other options. + - Added options related to the learning rate scheduler. + - You can download and use DiffUsers models directly from Hugging Face. In addition, DiffUsers models can be saved during training. +* 11/21 (v10): + - Added minimum/maximum resolution specification when using Aspect Ratio Bucketing (min_bucket_reso/max_bucket_reso option). + - Added extension specification for caption files (caption_extention). + - Added support for images with .webp extension. + - Added a function that allows captions to learning images and regularized images. +* 11/18 (v9): + - Added support for Aspect Ratio Bucketing (enable_bucket option). (--enable_bucket) + - Added support for selecting data format (fp16/bf16/float) when saving checkpoint (--save_precision) + - Added support for saving learning state (--save_state, --resume) + - Added support for logging (--logging_dir) +* 11/14 (diffusers_fine_tuning v2): + - script name is now fine_tune.py. + - Added option to learn Text Encoder --train_text_encoder. + - The data format of checkpoint at the time of saving can be specified with the --save_precision option. You can choose float, fp16, and bf16. + - Added a --save_state option to save the learning state (optimizer, etc.) in the middle. It can be resumed with the --resume option. +* 11/9 (v8): supports Diffusers 0.7.2. To upgrade diffusers run `pip install --upgrade diffusers[torch]` +* 11/7 (v7): Text Encoder supports checkpoint files in different storage formats (it is converted at the time of import, so export will be in normal format). Changed the average value of EPOCH loss to output to the screen. Added a function to save epoch and global step in checkpoint in SD format (add values if there is existing data). The reg_data_dir option is enabled during fine tuning (fine tuning while mixing regularized images). Added dataset_repeats option that is valid for fine tuning (specified when the number of teacher images is small and the epoch is extremely short). 
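+
+## Estimating training steps
+
+The subfolder names described under "Folders configuration" encode how many times each image is repeated per epoch (`20_asd dog` = 20 repeats, `1_dog` = 1 repeat), and the example scripts under `examples\` compute their training-step counts as roughly `ceil(images * repeats / batch_size * epochs)`. The Python sketch below is only an illustration of that arithmetic (the helper name and the exact rounding are assumptions, not code shipped with this repo):
+
+```python
+import math
+import os
+
+def estimate_max_train_steps(train_data_dir, batch_size, epochs=1):
+    """Rough step estimate for a '<repeats>_<token> <class>' folder layout."""
+    total = 0
+    for name in os.listdir(train_data_dir):
+        subdir = os.path.join(train_data_dir, name)
+        prefix = name.split("_", 1)[0]
+        if not os.path.isdir(subdir) or not prefix.isdigit():
+            continue
+        repeats = int(prefix)                     # '20_asd dog' -> 20
+        images = sum(1 for f in os.listdir(subdir)
+                     if f.lower().endswith((".png", ".jpg", ".jpeg", ".webp")))
+        total += repeats * images
+    return math.ceil(total / batch_size * epochs)
+
+# e.g. 8 images in '20_asd dog', batch size 6, 1 epoch -> ceil(160 / 6) = 27 steps
+print(estimate_max_train_steps(r"my_asd_dog_dreambooth\train_dog", batch_size=6))
+```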
\ No newline at end of file diff --git a/README_finetune.md b/README_finetune.md new file mode 100644 index 0000000..30098df --- /dev/null +++ b/README_finetune.md @@ -0,0 +1,167 @@ +# Kohya_ss Finetune + +This python utility provide code to run the diffusers fine tuning version found in this note: https://note.com/kohya_ss/n/nbf7ce8d80f29 + +## Required Dependencies + +Python 3.10.6 and Git: + +- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe +- git: https://git-scm.com/download/win + +Give unrestricted script access to powershell so venv can work: + +- Open an administrator powershell window +- Type `Set-ExecutionPolicy Unrestricted` and answer A +- Close admin powershell window + +## Installation + +Open a regular Powershell terminal and type the following inside: + +```powershell +git clone https://github.com/bmaltais/kohya_diffusers_fine_tuning.git +cd kohya_diffusers_fine_tuning + +python -m venv --system-site-packages venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config + +``` + +Answers to accelerate config: + +```txt +- 0 +- 0 +- NO +- NO +- All +- fp16 +``` + +### Optional: CUDNN 8.6 + +This step is optional but can improve the learning speed for NVidia 4090 owners... + +Due to the filesize I can't host the DLLs needed for CUDNN 8.6 on Github, I strongly advise you download them for a speed boost in sample generation (almost 50% on 4090) you can download them from here: https://b1.thefileditch.ch/mwxKTEtelILoIbMbruuM.zip + +To install simply unzip the directory and place the cudnn_windows folder in the root of the kohya_diffusers_fine_tuning repo. + +Run the following command to install: + +``` +python .\tools\cudann_1.8_install.py +``` + +## Upgrade + +When a new release comes out you can upgrade your repo with the following command: + +``` +.\upgrade.bat +``` + +or you can do it manually with + +```powershell +cd kohya_ss +git pull +.\venv\Scripts\activate +pip install --upgrade -r requirements.txt +``` + +Once the commands have completed successfully you should be ready to use the new version. + +## Folders configuration + +Simply put all the images you will want to train on in a single directory. It does not matter what size or aspect ratio they have. It is your choice. + +## Captions + +Each file need to be accompanied by a caption file describing what the image is about. For example, if you want to train on cute dog pictures you can put `cute dog` as the caption in every file. 
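+
+If you prefer Python, the short sketch below does the same thing: it writes a `.txt` caption file next to every image that does not already have one. It is only an illustrative snippet (not a script shipped with this repo); the `sample` folder name and `cute dog` caption mirror the PowerShell example further down.
+
+```python
+from pathlib import Path
+
+folder = Path("sample")      # folder containing the training images (placeholder)
+caption_text = "cute dog"    # caption written for every image (placeholder)
+
+# Create <image>.txt next to each image that does not already have a caption file
+for image in folder.iterdir():
+    if image.suffix.lower() in {".png", ".jpg", ".webp"}:
+        caption_file = image.with_suffix(".txt")
+        if not caption_file.exists():
+            caption_file.write_text(caption_text)
+```
+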
You can use the `tools\caption.ps1` sample code to help out with that:
+
+```powershell
+$folder = "sample"
+$file_pattern="*.*"
+$caption_text="cute dog"
+
+$files = Get-ChildItem "$folder\$file_pattern" -Include *.png, *.jpg, *.webp -File
+foreach ($file in $files) {
+  if (-not(Test-Path -Path $folder\"$($file.BaseName).txt" -PathType Leaf)) {
+    New-Item -ItemType file -Path $folder -Name "$($file.BaseName).txt" -Value $caption_text
+  }
+}
+```
+
+You can also use the `Captioning` tool found under the `Utilities` tab in the GUI.
+
+## GUI
+
+There is now support for GUI based training using gradio. You can start the GUI interface by running:
+
+```powershell
+.\finetune.bat
+```
+
+## CLI
+
+You can find various examples of how to leverage fine_tune.py in this folder: https://github.com/bmaltais/kohya_ss/tree/master/examples
+
+## Support
+
+Drop by the discord server for support: https://discord.com/channels/1041518562487058594/1041518563242020906
+
+## Change history
+
+* 12/20 (v9.6) update:
+  - Fix issue with config file save and opening
+* 12/19 (v9.5) update:
+  - Fix file/folder dialog opening behind the browser window
+  - Update GUI layout to be more logical
+* 12/18 (v9.4) update:
+  - Add WD14 tagging to utilities
+* 12/18 (v9.3) update:
+  - Add logging option
+* 12/18 (v9.2) update:
+  - Add BLIP Captioning utility
+* 12/18 (v9.1) update:
+  - Add Stable Diffusion model conversion utility. Make sure to run `pip install --upgrade -r requirements.txt` after updating to this release as it introduces new pip requirements.
+* 12/17 (v9) update:
+  - Save model as option added to fine_tune.py
+  - Save model as option added to GUI
+  - Retired the CLI based documentation; focus will now be on GUI based training
+* 12/13 (v8):
+  - WD14Tagger now works on its own.
+  - Added support for fp16 training up to the gradients. See "Building the environment and preparing scripts for Diffusers" for more info.
+* 12/10 (v7):
+  - Added support for Diffusers 0.10.2.
+  - In addition, we have made other fixes.
+  - For more information, please see the section on "Building the environment and preparing scripts for Diffusers" in our documentation.
+* 12/6 (v6): Addressed reports that some models produced an error when saving in SafeTensors format.
+* 12/5 (v5):
+  - .safetensors format is now supported. Install SafeTensors with `pip install safetensors`. The format is automatically detected by extension when loading. Specify the use_safetensors option when saving.
+  - Added a log_prefix option to add any string before the date and time in the log directory name.
+  - Cleaning scripts now work even when only captions or only tags are present.
+* 11/29 (v4):
+  - DiffUsers 0.9.0 is required. Update with `pip install -U diffusers[torch]==0.9.0` in the virtual environment, and update the dependent libraries with `pip install --upgrade -r requirements.txt` if other errors occur.
+  - Compatible with Stable Diffusion v2.0. Add the --v2 option when training (and pre-fetching latents). If you are using 768-v-ema.ckpt or stable-diffusion-2 instead of stable-diffusion-v2-base, add --v_parameterization as well when training. See the other options for more details.
+  - The minimum resolution and maximum resolution of the bucket can be specified when pre-fetching latents.
+  - Corrected the loss calculation (it was previously scaling with the batch size).
+  - Added options related to the learning rate scheduler.
+  - DiffUsers models can now be downloaded and used directly from Hugging Face.
In addition, DiffUsers models can be saved during training. + - Available even if the clean_captions_and_tags.py is only a caption or a tag. + - Other minor fixes such as changing the arguments of the noise scheduler during training. +* 11/23 (v3): + - Added WD14Tagger tagging script. + - A log output function has been added to the fine_tune.py. Also, fixed the double shuffling of data. + - Fixed misspelling of options for each script (caption_extention→caption_extension will work for the time being, even if it remains outdated). diff --git a/diffusers_fine_tuning/README.md b/diffusers_fine_tuning/README.md deleted file mode 100644 index 68156e7..0000000 --- a/diffusers_fine_tuning/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Diffusers Fine Tuning - -Code has been moved to dedicated repo at: https://github.com/bmaltais/kohya_diffusers_fine_tuning \ No newline at end of file diff --git a/dreambooth.bat b/dreambooth.bat new file mode 100644 index 0000000..57bc144 --- /dev/null +++ b/dreambooth.bat @@ -0,0 +1 @@ +.\venv\Scripts\python.exe .\dreambooth_gui.py \ No newline at end of file diff --git a/dreambooth_gui.py b/dreambooth_gui.py index efd1af6..68f2e48 100644 --- a/dreambooth_gui.py +++ b/dreambooth_gui.py @@ -355,7 +355,7 @@ def train_model( lr_warmup_steps = round(float(int(lr_warmup) * int(max_train_steps) / 100)) print(f'lr_warmup_steps = {lr_warmup_steps}') - run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "train_db_fixed.py"' + run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "train_db.py"' if v2: run_cmd += ' --v2' if v_parameterization: @@ -734,10 +734,10 @@ with interface: shuffle_caption = gr.Checkbox( label='Shuffle caption', value=False ) - save_state = gr.Checkbox(label='Save state', value=False) + save_state = gr.Checkbox(label='Save training state', value=False) with gr.Row(): resume = gr.Textbox( - label='Resume', + label='Resume from saved training state', placeholder='path to "last-state" state folder to resume from', ) resume_button = gr.Button('📂', elem_id='open_folder_small') diff --git a/examples/kohya-1-folders.ps1 b/examples/kohya-1-folders.ps1 index 72660a7..b2cd3b9 100644 --- a/examples/kohya-1-folders.ps1 +++ b/examples/kohya-1-folders.ps1 @@ -32,7 +32,7 @@ Write-Output "Repeats: $repeats" .\venv\Scripts\activate -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed-ber.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --train_data_dir=$data_dir ` --output_dir=$output_dir ` @@ -51,7 +51,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra # 2nd pass at half the dataset repeat value -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$output_dir"\last.ckpt" ` --train_data_dir=$data_dir ` --output_dir=$output_dir"2" ` @@ -68,7 +68,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) ` --save_precision="fp16" - accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed-ber.py ` + accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` 
--pretrained_model_name_or_path=$output_dir"\last.ckpt" ` --train_data_dir=$data_dir ` --output_dir=$output_dir"2" ` diff --git a/examples/kohya-3-folders.ps1 b/examples/kohya-3-folders.ps1 index ed754a3..484d6fd 100644 --- a/examples/kohya-3-folders.ps1 +++ b/examples/kohya-3-folders.ps1 @@ -48,7 +48,7 @@ $square_mts = [Math]::Ceiling($square_repeats / $train_batch_size * $epoch) .\venv\Scripts\activate -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --train_data_dir=$landscape_data_dir ` --output_dir=$landscape_output_dir ` @@ -65,7 +65,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$dataset_repeats ` --save_precision="fp16" -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$landscape_output_dir"\last.ckpt" ` --train_data_dir=$portrait_data_dir ` --output_dir=$portrait_output_dir ` @@ -82,7 +82,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$dataset_repeats ` --save_precision="fp16" -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$portrait_output_dir"\last.ckpt" ` --train_data_dir=$square_data_dir ` --output_dir=$square_output_dir ` @@ -101,7 +101,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra # 2nd pass at half the dataset repeat value -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$square_output_dir"\last.ckpt" ` --train_data_dir=$landscape_data_dir ` --output_dir=$landscape_output_dir"2" ` @@ -118,7 +118,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) ` --save_precision="fp16" -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$landscape_output_dir"2\last.ckpt" ` --train_data_dir=$portrait_data_dir ` --output_dir=$portrait_output_dir"2" ` @@ -135,7 +135,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) ` --save_precision="fp16" -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$portrait_output_dir"2\last.ckpt" ` --train_data_dir=$square_data_dir ` --output_dir=$square_output_dir"2" ` diff --git a/examples/kohya.ps1 b/examples/kohya.ps1 index b6b7f0b..874221c 100644 --- a/examples/kohya.ps1 +++ b/examples/kohya.ps1 @@ -48,7 +48,7 @@ $square_mts = [Math]::Ceiling($square_repeats / $train_batch_size * $epoch) .\venv\Scripts\activate -accelerate launch --num_cpu_threads_per_process 
$num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --train_data_dir=$landscape_data_dir ` --output_dir=$landscape_output_dir ` @@ -65,7 +65,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$dataset_repeats ` --save_half -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$landscape_output_dir"\last.ckpt" ` --train_data_dir=$portrait_data_dir ` --output_dir=$portrait_output_dir ` @@ -82,7 +82,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$dataset_repeats ` --save_half -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$portrait_output_dir"\last.ckpt" ` --train_data_dir=$square_data_dir ` --output_dir=$square_output_dir ` @@ -101,7 +101,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra # 2nd pass at half the dataset repeat value -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$square_output_dir"\last.ckpt" ` --train_data_dir=$landscape_data_dir ` --output_dir=$landscape_output_dir"2" ` @@ -118,7 +118,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) ` --save_half -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$landscape_output_dir"2\last.ckpt" ` --train_data_dir=$portrait_data_dir ` --output_dir=$portrait_output_dir"2" ` @@ -135,7 +135,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process tra --dataset_repeats=$([Math]::Ceiling($dataset_repeats/2)) ` --save_half -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --pretrained_model_name_or_path=$portrait_output_dir"2\last.ckpt" ` --train_data_dir=$square_data_dir ` --output_dir=$square_output_dir"2" ` diff --git a/examples/kohya_bucket.ps1 b/examples/kohya_bucket.ps1 deleted file mode 100644 index f6cfef3..0000000 --- a/examples/kohya_bucket.ps1 +++ /dev/null @@ -1,69 +0,0 @@ -# This powershell script will create a model using the fine tuning dreambooth method. It will require landscape, -# portrait and square images. 
-# -# Adjust the script to your own needs - -# Sylvia Ritter -# variable values -$pretrained_model_name_or_path = "D:\models\v1-5-pruned-mse-vae.ckpt" -$train_dir = "D:\dreambooth\train_bernard\v3" -$folder_name = "dataset" - -$learning_rate = 1e-6 -$dataset_repeats = 80 -$train_batch_size = 6 -$epoch = 1 -$save_every_n_epochs=1 -$mixed_precision="fp16" -$num_cpu_threads_per_process=6 - - -# You should not have to change values past this point - -$data_dir = $train_dir + "\" + $folder_name -$output_dir = $train_dir + "\model" - -# stop script on error -$ErrorActionPreference = "Stop" - -.\venv\Scripts\activate - -$data_dir_buckets = $data_dir + "-buckets" - -python .\diffusers_fine_tuning\create_buckets.py $data_dir $data_dir_buckets --max_resolution "768,512" - -foreach($directory in Get-ChildItem -path $data_dir_buckets -Directory) - -{ - if (Test-Path -Path $output_dir-$directory) - { - Write-Host "The folder $output_dir-$directory already exists, skipping bucket." - } - else - { - Write-Host $directory - $dir_img_num = Get-ChildItem "$data_dir_buckets\$directory" -Recurse -File -Include *.jpg | Measure-Object | %{$_.Count} - $repeats = $dir_img_num * $dataset_repeats - $mts = [Math]::Ceiling($repeats / $train_batch_size * $epoch) - - Write-Host - - accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed-ber.py ` - --pretrained_model_name_or_path=$pretrained_model_name_or_path ` - --train_data_dir=$data_dir_buckets\$directory ` - --output_dir=$output_dir-$directory ` - --resolution=$directory ` - --train_batch_size=$train_batch_size ` - --learning_rate=$learning_rate ` - --max_train_steps=$mts ` - --use_8bit_adam ` - --xformers ` - --mixed_precision=$mixed_precision ` - --save_every_n_epochs=$save_every_n_epochs ` - --fine_tuning ` - --dataset_repeats=$dataset_repeats ` - --save_precision="fp16" - } - - $pretrained_model_name_or_path = "$output_dir-$directory\last.ckpt" -} \ No newline at end of file diff --git a/examples/kohya_diffuser.ps1 b/examples/kohya_diffuser.ps1 deleted file mode 100644 index a12b20f..0000000 --- a/examples/kohya_diffuser.ps1 +++ /dev/null @@ -1,72 +0,0 @@ -# Sylvia Ritter. 
AKA: by silvery trait - -# variable values -$pretrained_model_name_or_path = "D:\models\v1-5-pruned-mse-vae.ckpt" -$train_dir = "D:\dreambooth\train_sylvia_ritter\raw_data" -$training_folder = "all-images-v3" - -$learning_rate = 5e-6 -$dataset_repeats = 40 -$train_batch_size = 6 -$epoch = 4 -$save_every_n_epochs=1 -$mixed_precision="bf16" -$num_cpu_threads_per_process=6 - -$max_resolution = "768,576" - -# You should not have to change values past this point - -# stop script on error -$ErrorActionPreference = "Stop" - -# activate venv -.\venv\Scripts\activate - -# create caption json file -python D:\kohya_ss\diffusers_fine_tuning\merge_captions_to_metadata.py ` ---caption_extention ".txt" $train_dir"\"$training_folder $train_dir"\meta_cap.json" - -# create images buckets -python D:\kohya_ss\diffusers_fine_tuning\prepare_buckets_latents.py ` - $train_dir"\"$training_folder ` - $train_dir"\meta_cap.json" ` - $train_dir"\meta_lat.json" ` - $pretrained_model_name_or_path ` - --batch_size 4 --max_resolution $max_resolution --mixed_precision fp16 - -# Get number of valid images -$image_num = Get-ChildItem "$train_dir\$training_folder" -Recurse -File -Include *.npz | Measure-Object | %{$_.Count} -$repeats = $image_num * $dataset_repeats - -# calculate max_train_set -$max_train_set = [Math]::Ceiling($repeats / $train_batch_size * $epoch) - - -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\diffusers_fine_tuning\fine_tune.py ` - --pretrained_model_name_or_path=$pretrained_model_name_or_path ` - --in_json $train_dir"\meta_lat.json" ` - --train_data_dir=$train_dir"\"$training_folder ` - --output_dir=$train_dir"\fine_tuned" ` - --train_batch_size=$train_batch_size ` - --dataset_repeats=$dataset_repeats ` - --learning_rate=$learning_rate ` - --max_train_steps=$max_train_set ` - --use_8bit_adam --xformers ` - --mixed_precision=$mixed_precision ` - --save_every_n_epochs=$save_every_n_epochs ` - --save_precision="fp16" - -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\diffusers_fine_tuning\fine_tune.py ` - --pretrained_model_name_or_path=$train_dir"\fine_tuned\last.ckpt" ` - --in_json $train_dir"\meta_lat.json" ` - --train_data_dir=$train_dir"\"$training_folder ` - --output_dir=$train_dir"\fine_tuned2" ` - --train_batch_size=$train_batch_size ` - --dataset_repeats=$([Math]::Ceiling($dataset_repeats / 2)) ` - --learning_rate=$learning_rate ` - --max_train_steps=$([Math]::Ceiling($max_train_set / 2)) ` - --use_8bit_adam --xformers ` - --mixed_precision=$mixed_precision ` - --save_every_n_epochs=$save_every_n_epochs ` - --save_precision="fp16" diff --git a/examples/kohya_finetune.ps1 b/examples/kohya_finetune.ps1 new file mode 100644 index 0000000..4959157 --- /dev/null +++ b/examples/kohya_finetune.ps1 @@ -0,0 +1,153 @@ +# variables related to the pretrained model +$pretrained_model_name_or_path = "D:\models\test\samdoesart2\model\last" +$v2 = 1 # set to 1 for true or 0 for false +$v_model = 0 # set to 1 for true or 0 for false + +# variables related to the training dataset and output directory +$train_dir = "D:\models\test\samdoesart2" +$image_folder = "D:\dataset\samdoesart2\raw" +$output_dir = "D:\models\test\samdoesart2\model_e2\" +$max_resolution = "512,512" + +# variables related to the training process +$learning_rate = 1e-6 +$lr_scheduler = "constant" # Default is constant +$lr_warmup = 0 # % of steps to warmup for 0 - 100. Default is 0. 
+$dataset_repeats = 40 +$train_batch_size = 8 +$epoch = 1 +$save_every_n_epochs = 1 +$mixed_precision = "bf16" +$save_precision = "fp16" # use fp16 for better compatibility with auto1111 and other repo +$seed = "494481440" +$num_cpu_threads_per_process = 6 +$train_text_encoder = 0 # set to 1 to train text encoder otherwise set to 0 + +# variables related to the resulting diffuser model. If input is ckpt or tensors then it is not applicable +$convert_to_safetensors = 1 # set to 1 to convert resulting diffuser to ckpt +$convert_to_ckpt = 1 # set to 1 to convert resulting diffuser to ckpt + +# other variables +$kohya_finetune_repo_path = "D:\kohya_ss" + +### You should not need to change things below + +# Set variables to useful values using ternary operator +$v_model = ($v_model -eq 0) ? $null : "--v_parameterization" +$v2 = ($v2 -eq 0) ? $null : "--v2" +$train_text_encoder = ($train_text_encoder -eq 0) ? $null : "--train_text_encoder" + +# stop script on error +$ErrorActionPreference = "Stop" + +# define a list of substrings to search for +$substrings_v2 = "stable-diffusion-2-1-base", "stable-diffusion-2-base" + +# check if $v2 and $v_model are empty and if $pretrained_model_name_or_path contains any of the substrings in the v2 list +if ($v2 -eq $null -and $v_model -eq $null -and ($substrings_v2 | Where-Object { $pretrained_model_name_or_path -match $_ }).Count -gt 0) { + Write-Host("SD v2 model detected. Setting --v2 parameter") + $v2 = "--v2" + $v_model = $null +} + +# define a list of substrings to search for v-objective +$substrings_v_model = "stable-diffusion-2-1", "stable-diffusion-2" + +# check if $v2 and $v_model are empty and if $pretrained_model_name_or_path contains any of the substrings in the v_model list +elseif ($v2 -eq $null -and $v_model -eq $null -and ($substrings_v_model | Where-Object { $pretrained_model_name_or_path -match $_ }).Count -gt 0) { + Write-Host("SD v2 v_model detected. 
Setting --v2 parameter and --v_parameterization") + $v2 = "--v2" + $v_model = "--v_parameterization" +} + +# activate venv +cd $kohya_finetune_repo_path +.\venv\Scripts\activate + +# create caption json file +if (!(Test-Path -Path $train_dir)) { + New-Item -Path $train_dir -ItemType "directory" +} + +python $kohya_finetune_repo_path\script\merge_captions_to_metadata.py ` + --caption_extention ".txt" $image_folder $train_dir"\meta_cap.json" + +# create images buckets +python $kohya_finetune_repo_path\script\prepare_buckets_latents.py ` + $image_folder ` + $train_dir"\meta_cap.json" ` + $train_dir"\meta_lat.json" ` + $pretrained_model_name_or_path ` + --batch_size 4 --max_resolution $max_resolution --mixed_precision $mixed_precision + +# Get number of valid images +$image_num = Get-ChildItem "$image_folder" -Recurse -File -Include *.npz | Measure-Object | % { $_.Count } + +$repeats = $image_num * $dataset_repeats +Write-Host("Repeats = $repeats") + +# calculate max_train_set +$max_train_set = [Math]::Ceiling($repeats / $train_batch_size * $epoch) +Write-Host("max_train_set = $max_train_set") + +$lr_warmup_steps = [Math]::Round($lr_warmup * $max_train_set / 100) +Write-Host("lr_warmup_steps = $lr_warmup_steps") + +Write-Host("$v2 $v_model") + +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process $kohya_finetune_repo_path\script\fine_tune.py ` + $v2 ` + $v_model ` + --pretrained_model_name_or_path=$pretrained_model_name_or_path ` + --in_json $train_dir\meta_lat.json ` + --train_data_dir="$image_folder" ` + --output_dir=$output_dir ` + --train_batch_size=$train_batch_size ` + --dataset_repeats=$dataset_repeats ` + --learning_rate=$learning_rate ` + --lr_scheduler=$lr_scheduler ` + --lr_warmup_steps=$lr_warmup_steps ` + --max_train_steps=$max_train_set ` + --use_8bit_adam ` + --xformers ` + --mixed_precision=$mixed_precision ` + --save_every_n_epochs=$save_every_n_epochs ` + --seed=$seed ` + $train_text_encoder ` + --save_precision=$save_precision + +# check if $output_dir\last is a directory... 
therefore it is a diffuser model +if (Test-Path "$output_dir\last" -PathType Container) { + if ($convert_to_ckpt) { + Write-Host("Converting diffuser model $output_dir\last to $output_dir\last.ckpt") + python "$kohya_finetune_repo_path\tools\convert_diffusers20_original_sd.py" ` + $output_dir\last ` + $output_dir\last.ckpt ` + --$save_precision + } + if ($convert_to_safetensors) { + Write-Host("Converting diffuser model $output_dir\last to $output_dir\last.safetensors") + python "$kohya_finetune_repo_path\tools\convert_diffusers20_original_sd.py" ` + $output_dir\last ` + $output_dir\last.safetensors ` + --$save_precision + } +} + +# define a list of substrings to search for inference file +$substrings_sd_model = ".ckpt", ".safetensors" +$matching_extension = foreach ($ext in $substrings_sd_model) { + Get-ChildItem $output_dir -File | Where-Object { $_.Extension -contains $ext } +} + +if ($matching_extension.Count -gt 0) { + # copy the file named "v2-inference.yaml" from the "v2_inference" folder to $output_dir as last.yaml + if ( $v2 -ne $null -and $v_model -ne $null) { + Write-Host("Saving v2-inference-v.yaml as $output_dir\last.yaml") + Copy-Item -Path "$kohya_finetune_repo_path\v2_inference\v2-inference-v.yaml" -Destination "$output_dir\last.yaml" + } + elseif ( $v2 -ne $null ) { + Write-Host("Saving v2-inference.yaml as $output_dir\last.yaml") + Copy-Item -Path "$kohya_finetune_repo_path\v2_inference\v2-inference.yaml" -Destination "$output_dir\last.yaml" + } +} \ No newline at end of file diff --git a/examples/kohya_new-v3.ps1 b/examples/kohya_new-v3.ps1 index 6d2a4ca..2810c37 100644 --- a/examples/kohya_new-v3.ps1 +++ b/examples/kohya_new-v3.ps1 @@ -24,11 +24,11 @@ $ErrorActionPreference = "Stop" .\venv\Scripts\activate # create caption json file -python D:\kohya_ss\diffusers_fine_tuning\merge_captions_to_metadata.py ` +python D:\kohya_ss\finetune\merge_captions_to_metadata.py ` --caption_extention ".txt" $train_dir"\"$training_folder $train_dir"\meta_cap.json" # create images buckets -python D:\kohya_ss\diffusers_fine_tuning\prepare_buckets_latents.py ` +python D:\kohya_ss\finetune\prepare_buckets_latents.py ` $train_dir"\"$training_folder ` $train_dir"\meta_cap.json" ` $train_dir"\meta_lat.json" ` @@ -43,7 +43,7 @@ $repeats = $image_num * $dataset_repeats $max_train_set = [Math]::Ceiling($repeats / $train_batch_size * $epoch) -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\diffusers_fine_tuning\fine_tune.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\finetune\fine_tune.py ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --in_json $train_dir"\meta_lat.json" ` --train_data_dir=$train_dir"\"$training_folder ` @@ -58,7 +58,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\ --train_text_encoder ` --save_precision="fp16" -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\diffusers_fine_tuning\fine_tune_v1-ber.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\finetune\fine_tune.py ` --pretrained_model_name_or_path=$train_dir"\fine_tuned\last.ckpt" ` --in_json $train_dir"\meta_lat.json" ` --train_data_dir=$train_dir"\"$training_folder ` @@ -74,7 +74,7 @@ accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\ # Hypernetwork -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process 
D:\kohya_ss\diffusers_fine_tuning\fine_tune_v1-ber.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process D:\kohya_ss\finetune\fine_tune.py ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --in_json $train_dir"\meta_lat.json" ` --train_data_dir=$train_dir"\"$training_folder ` diff --git a/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1 b/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1 index 80d82a3..28aa1e7 100644 --- a/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1 +++ b/examples/kohya_train_db_fixed_with-reg_SDv2 512 base.ps1 @@ -8,7 +8,7 @@ $pretrained_model_name_or_path = "D:\models\512-base-ema.ckpt" $data_dir = "D:\models\dariusz_zawadzki\kohya_reg\data" $reg_data_dir = "D:\models\dariusz_zawadzki\kohya_reg\reg" $logging_dir = "D:\models\dariusz_zawadzki\logs" -$output_dir = "D:\models\dariusz_zawadzki\train_db_fixed_model_reg_v2" +$output_dir = "D:\models\dariusz_zawadzki\train_db_model_reg_v2" $resolution = "512,512" $lr_scheduler="polynomial" $cache_latents = 1 # 1 = true, 0 = false @@ -41,7 +41,7 @@ Write-Output "Repeats: $repeats" cd D:\kohya_ss .\venv\Scripts\activate -accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db_fixed.py ` +accelerate launch --num_cpu_threads_per_process $num_cpu_threads_per_process train_db.py ` --v2 ` --pretrained_model_name_or_path=$pretrained_model_name_or_path ` --train_data_dir=$data_dir ` diff --git a/examples/test.ps1 b/examples/test.ps1 deleted file mode 100644 index 63603b9..0000000 --- a/examples/test.ps1 +++ /dev/null @@ -1,6 +0,0 @@ -$date = Read-Host "Enter the date (yyyy-mm-dd):" -Prompt "Invalid date format. Please try again (yyyy-mm-dd):" -ValidateScript { - # Parse the date input and return $true if it is in the correct format, - # or $false if it is not - $date = [DateTime]::Parse($_) - return $date -ne $null -} \ No newline at end of file diff --git a/fine_tune.py b/fine_tune.py new file mode 100644 index 0000000..4795edd --- /dev/null +++ b/fine_tune.py @@ -0,0 +1,1059 @@ +# v2: select precision for saved checkpoint +# v3: add logging for tensorboard, fix to shuffle=False in DataLoader (shuffling is in dataset) +# v4: support SD2.0, add lr scheduler options, supports save_every_n_epochs and save_state for DiffUsers model +# v5: refactor to use model_util, support safetensors, add settings to use Diffusers' xformers, add log prefix +# v6: model_util update +# v7: support Diffusers 0.10.0 (v-parameterization training, safetensors in Diffusers) and accelerate 0.15.0, support full path in metadata +# v8: experimental full fp16 training. +# v9: add keep_tokens and save_model_as option, flip augmentation + +# このスクリプトのライセンスは、train_dreambooth.pyと同じくApache License 2.0とします +# License: +# Copyright 2022 Kohya S. @kohya_ss +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# License of included scripts: + +# Diffusers: ASL 2.0 https://github.com/huggingface/diffusers/blob/main/LICENSE + +# Memory efficient attention: +# based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py +# MIT https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/LICENSE + +import argparse +import math +import os +import random +import json +import importlib +import time + +from tqdm import tqdm +import torch +from accelerate import Accelerator +from accelerate.utils import set_seed +from transformers import CLIPTokenizer +import diffusers +from diffusers import DDPMScheduler, StableDiffusionPipeline +import numpy as np +from einops import rearrange +from torch import einsum + +import library.model_util as model_util + +# Tokenizer: checkpointから読み込むのではなくあらかじめ提供されているものを使う +TOKENIZER_PATH = "openai/clip-vit-large-patch14" +V2_STABLE_DIFFUSION_PATH = "stabilityai/stable-diffusion-2" # ここからtokenizerだけ使う v2とv2.1はtokenizer仕様は同じ + +# checkpointファイル名 +EPOCH_STATE_NAME = "epoch-{:06d}-state" +LAST_STATE_NAME = "last-state" + +LAST_DIFFUSERS_DIR_NAME = "last" +EPOCH_DIFFUSERS_DIR_NAME = "epoch-{:06d}" + + +def collate_fn(examples): + return examples[0] + + +class FineTuningDataset(torch.utils.data.Dataset): + def __init__(self, metadata, train_data_dir, batch_size, tokenizer, max_token_length, shuffle_caption, shuffle_keep_tokens, dataset_repeats, debug) -> None: + super().__init__() + + self.metadata = metadata + self.train_data_dir = train_data_dir + self.batch_size = batch_size + self.tokenizer: CLIPTokenizer = tokenizer + self.max_token_length = max_token_length + self.shuffle_caption = shuffle_caption + self.shuffle_keep_tokens = shuffle_keep_tokens + self.debug = debug + + self.tokenizer_max_length = self.tokenizer.model_max_length if max_token_length is None else max_token_length + 2 + + print("make buckets") + + # 最初に数を数える + self.bucket_resos = set() + for img_md in metadata.values(): + if 'train_resolution' in img_md: + self.bucket_resos.add(tuple(img_md['train_resolution'])) + self.bucket_resos = list(self.bucket_resos) + self.bucket_resos.sort() + print(f"number of buckets: {len(self.bucket_resos)}") + + reso_to_index = {} + for i, reso in enumerate(self.bucket_resos): + reso_to_index[reso] = i + + # bucketに割り当てていく + self.buckets = [[] for _ in range(len(self.bucket_resos))] + n = 1 if dataset_repeats is None else dataset_repeats + images_count = 0 + for image_key, img_md in metadata.items(): + if 'train_resolution' not in img_md: + continue + if not os.path.exists(self.image_key_to_npz_file(image_key)): + continue + + reso = tuple(img_md['train_resolution']) + for _ in range(n): + self.buckets[reso_to_index[reso]].append(image_key) + images_count += n + + # 参照用indexを作る + self.buckets_indices = [] + for bucket_index, bucket in enumerate(self.buckets): + batch_count = int(math.ceil(len(bucket) / self.batch_size)) + for batch_index in range(batch_count): + self.buckets_indices.append((bucket_index, batch_index)) + + self.shuffle_buckets() + self._length = len(self.buckets_indices) + self.images_count = images_count + + def show_buckets(self): + for i, (reso, bucket) in enumerate(zip(self.bucket_resos, self.buckets)): + print(f"bucket {i}: resolution {reso}, count: {len(bucket)}") + + def shuffle_buckets(self): + random.shuffle(self.buckets_indices) + for bucket in self.buckets: + random.shuffle(bucket) + + def image_key_to_npz_file(self, image_key): + npz_file_norm = 
os.path.splitext(image_key)[0] + '.npz' + if os.path.exists(npz_file_norm): + if random.random() < .5: + npz_file_flip = os.path.splitext(image_key)[0] + '_flip.npz' + if os.path.exists(npz_file_flip): + return npz_file_flip + return npz_file_norm + + npz_file_norm = os.path.join(self.train_data_dir, image_key + '.npz') + if random.random() < .5: + npz_file_flip = os.path.join(self.train_data_dir, image_key + '_flip.npz') + if os.path.exists(npz_file_flip): + return npz_file_flip + return npz_file_norm + + def load_latent(self, image_key): + return np.load(self.image_key_to_npz_file(image_key))['arr_0'] + + def __len__(self): + return self._length + + def __getitem__(self, index): + if index == 0: + self.shuffle_buckets() + + bucket = self.buckets[self.buckets_indices[index][0]] + image_index = self.buckets_indices[index][1] * self.batch_size + + input_ids_list = [] + latents_list = [] + captions = [] + for image_key in bucket[image_index:image_index + self.batch_size]: + img_md = self.metadata[image_key] + caption = img_md.get('caption') + tags = img_md.get('tags') + + if caption is None: + caption = tags + elif tags is not None and len(tags) > 0: + caption = caption + ', ' + tags + assert caption is not None and len(caption) > 0, f"caption or tag is required / キャプションまたはタグは必須です:{image_key}" + + latents = self.load_latent(image_key) + + if self.shuffle_caption: + tokens = caption.strip().split(",") + if self.shuffle_keep_tokens is None: + random.shuffle(tokens) + else: + if len(tokens) > self.shuffle_keep_tokens: + keep_tokens = tokens[:self.shuffle_keep_tokens] + tokens = tokens[self.shuffle_keep_tokens:] + random.shuffle(tokens) + tokens = keep_tokens + tokens + caption = ",".join(tokens).strip() + + captions.append(caption) + + input_ids = self.tokenizer(caption, padding="max_length", truncation=True, + max_length=self.tokenizer_max_length, return_tensors="pt").input_ids + + if self.tokenizer_max_length > self.tokenizer.model_max_length: + input_ids = input_ids.squeeze(0) + iids_list = [] + if self.tokenizer.pad_token_id == self.tokenizer.eos_token_id: + # v1 + # 77以上の時は " .... " でトータル227とかになっているので、"..."の三連に変換する + # 1111氏のやつは , で区切る、とかしているようだが とりあえず単純に + for i in range(1, self.tokenizer_max_length - self.tokenizer.model_max_length + 2, self.tokenizer.model_max_length - 2): # (1, 152, 75) + ids_chunk = (input_ids[0].unsqueeze(0), + input_ids[i:i + self.tokenizer.model_max_length - 2], + input_ids[-1].unsqueeze(0)) + ids_chunk = torch.cat(ids_chunk) + iids_list.append(ids_chunk) + else: + # v2 + # 77以上の時は " .... ..." でトータル227とかになっているので、"... ..."の三連に変換する + for i in range(1, self.tokenizer_max_length - self.tokenizer.model_max_length + 2, self.tokenizer.model_max_length - 2): + ids_chunk = (input_ids[0].unsqueeze(0), # BOS + input_ids[i:i + self.tokenizer.model_max_length - 2], + input_ids[-1].unsqueeze(0)) # PAD or EOS + ids_chunk = torch.cat(ids_chunk) + + # 末尾が または の場合は、何もしなくてよい + # 末尾が x の場合は末尾を に変える(x なら結果的に変化なし) + if ids_chunk[-2] != self.tokenizer.eos_token_id and ids_chunk[-2] != self.tokenizer.pad_token_id: + ids_chunk[-1] = self.tokenizer.eos_token_id + # 先頭が ... の場合は ... 
に変える + if ids_chunk[1] == self.tokenizer.pad_token_id: + ids_chunk[1] = self.tokenizer.eos_token_id + + iids_list.append(ids_chunk) + + input_ids = torch.stack(iids_list) # 3,77 + + input_ids_list.append(input_ids) + latents_list.append(torch.FloatTensor(latents)) + + example = {} + example['input_ids'] = torch.stack(input_ids_list) + example['latents'] = torch.stack(latents_list) + if self.debug: + example['image_keys'] = bucket[image_index:image_index + self.batch_size] + example['captions'] = captions + return example + + +def save_hypernetwork(output_file, hypernetwork): + state_dict = hypernetwork.get_state_dict() + torch.save(state_dict, output_file) + + +def train(args): + fine_tuning = args.hypernetwork_module is None # fine tuning or hypernetwork training + + # その他のオプション設定を確認する + if args.v_parameterization and not args.v2: + print("v_parameterization should be with v2 / v1でv_parameterizationを使用することは想定されていません") + if args.v2 and args.clip_skip is not None: + print("v2 with clip_skip will be unexpected / v2でclip_skipを使用することは想定されていません") + + # モデル形式のオプション設定を確認する + load_stable_diffusion_format = os.path.isfile(args.pretrained_model_name_or_path) + + if load_stable_diffusion_format: + src_stable_diffusion_ckpt = args.pretrained_model_name_or_path + src_diffusers_model_path = None + else: + src_stable_diffusion_ckpt = None + src_diffusers_model_path = args.pretrained_model_name_or_path + + if args.save_model_as is None: + save_stable_diffusion_format = load_stable_diffusion_format + use_safetensors = args.use_safetensors + else: + save_stable_diffusion_format = args.save_model_as.lower() == 'ckpt' or args.save_model_as.lower() == 'safetensors' + use_safetensors = args.use_safetensors or ("safetensors" in args.save_model_as.lower()) + + # 乱数系列を初期化する + if args.seed is not None: + set_seed(args.seed) + + # メタデータを読み込む + if os.path.exists(args.in_json): + print(f"loading existing metadata: {args.in_json}") + with open(args.in_json, "rt", encoding='utf-8') as f: + metadata = json.load(f) + else: + print(f"no metadata / メタデータファイルがありません: {args.in_json}") + return + + # tokenizerを読み込む + print("prepare tokenizer") + if args.v2: + tokenizer = CLIPTokenizer.from_pretrained(V2_STABLE_DIFFUSION_PATH, subfolder="tokenizer") + else: + tokenizer = CLIPTokenizer.from_pretrained(TOKENIZER_PATH) + + if args.max_token_length is not None: + print(f"update token length: {args.max_token_length}") + + # datasetを用意する + print("prepare dataset") + train_dataset = FineTuningDataset(metadata, args.train_data_dir, args.train_batch_size, + tokenizer, args.max_token_length, args.shuffle_caption, args.keep_tokens, + args.dataset_repeats, args.debug_dataset) + + print(f"Total dataset length / データセットの長さ: {len(train_dataset)}") + print(f"Total images / 画像数: {train_dataset.images_count}") + + if len(train_dataset) == 0: + print("No data found. Please verify the metadata file and train_data_dir option. 
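The format decision above is made once, up front: if `--save_model_as` is omitted the model is written back in whatever format it was loaded from; `ckpt`/`safetensors` mean a single StableDiffusion file, `diffusers`/`diffusers_safetensors` a Diffusers directory. A small paraphrase of that decision, for illustration only:

```python
def resolve_save_format(save_model_as, loaded_single_file, use_safetensors_flag):
    # Mirrors the logic above: fall back to the input format when --save_model_as
    # is not given, otherwise honour the requested format.
    if save_model_as is None:
        return loaded_single_file, use_safetensors_flag
    fmt = save_model_as.lower()
    as_single_file = fmt in ("ckpt", "safetensors")          # single StableDiffusion checkpoint file
    return as_single_file, use_safetensors_flag or "safetensors" in fmt

print(resolve_save_format("diffusers_safetensors", True, False))   # (False, True)
```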
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。") + return + + if args.debug_dataset: + train_dataset.show_buckets() + i = 0 + for example in train_dataset: + print(f"image: {example['image_keys']}") + print(f"captions: {example['captions']}") + print(f"latents: {example['latents'].shape}") + print(f"input_ids: {example['input_ids'].shape}") + print(example['input_ids']) + i += 1 + if i >= 8: + break + return + + # acceleratorを準備する + print("prepare accelerator") + if args.logging_dir is None: + log_with = None + logging_dir = None + else: + log_with = "tensorboard" + log_prefix = "" if args.log_prefix is None else args.log_prefix + logging_dir = args.logging_dir + "/" + log_prefix + time.strftime('%Y%m%d%H%M%S', time.localtime()) + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, log_with=log_with, logging_dir=logging_dir) + + # accelerateの互換性問題を解決する + accelerator_0_15 = True + try: + accelerator.unwrap_model("dummy", True) + print("Using accelerator 0.15.0 or above.") + except TypeError: + accelerator_0_15 = False + + def unwrap_model(model): + if accelerator_0_15: + return accelerator.unwrap_model(model, True) + return accelerator.unwrap_model(model) + + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype = torch.float32 + if args.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif args.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + save_dtype = None + if args.save_precision == "fp16": + save_dtype = torch.float16 + elif args.save_precision == "bf16": + save_dtype = torch.bfloat16 + elif args.save_precision == "float": + save_dtype = torch.float32 + + # モデルを読み込む + if load_stable_diffusion_format: + print("load StableDiffusion checkpoint") + text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(args.v2, args.pretrained_model_name_or_path) + else: + print("load Diffusers pretrained models") + pipe = StableDiffusionPipeline.from_pretrained(args.pretrained_model_name_or_path, tokenizer=None, safety_checker=None) + # , torch_dtype=weight_dtype) ここでtorch_dtypeを指定すると学習時にエラーになる + text_encoder = pipe.text_encoder + unet = pipe.unet + vae = pipe.vae + del pipe + vae.to("cpu") # 保存時にしか使わないので、メモリを開けるためCPUに移しておく + + # Diffusers版のxformers使用フラグを設定する関数 + def set_diffusers_xformers_flag(model, valid): + # model.set_use_memory_efficient_attention_xformers(valid) # 次のリリースでなくなりそう + # pipeが自動で再帰的にset_use_memory_efficient_attention_xformersを探すんだって(;´Д`) + # U-Netだけ使う時にはどうすればいいのか……仕方ないからコピって使うか + # 0.10.2でなんか巻き戻って個別に指定するようになった(;^ω^) + + # Recursively walk through all the children. 
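The try/except around `accelerator.unwrap_model` above is a small feature probe: accelerate 0.15 added a second argument to `unwrap_model`, so the code detects the newer signature once and branches on it afterwards. The same pattern in isolation (a sketch, assuming an already-constructed Accelerator; the probe value is thrown away):

```python
from accelerate import Accelerator

accelerator = Accelerator()

try:
    accelerator.unwrap_model("dummy", True)   # only the call signature matters here
    newer_api = True                          # accelerate >= 0.15
except TypeError:
    newer_api = False                         # older accelerate: single-argument unwrap_model

def unwrap_model(model):
    return accelerator.unwrap_model(model, True) if newer_api else accelerator.unwrap_model(model)
```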
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + fn_recursive_set_mem_eff(model) + + # モデルに xformers とか memory efficient attention を組み込む + if args.diffusers_xformers: + print("Use xformers by Diffusers") + set_diffusers_xformers_flag(unet, True) + else: + # Windows版のxformersはfloatで学習できないのでxformersを使わない設定も可能にしておく必要がある + print("Disable Diffusers' xformers") + set_diffusers_xformers_flag(unet, False) + replace_unet_modules(unet, args.mem_eff_attn, args.xformers) + + if not fine_tuning: + # Hypernetwork + print("import hypernetwork module:", args.hypernetwork_module) + hyp_module = importlib.import_module(args.hypernetwork_module) + + hypernetwork = hyp_module.Hypernetwork() + + if args.hypernetwork_weights is not None: + print("load hypernetwork weights from:", args.hypernetwork_weights) + hyp_sd = torch.load(args.hypernetwork_weights, map_location='cpu') + success = hypernetwork.load_from_state_dict(hyp_sd) + assert success, "hypernetwork weights loading failed." + + print("apply hypernetwork") + hypernetwork.apply_to_diffusers(None, text_encoder, unet) + + # 学習を準備する:モデルを適切な状態にする + training_models = [] + if fine_tuning: + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + training_models.append(unet) + + if args.train_text_encoder: + print("enable text encoder training") + if args.gradient_checkpointing: + text_encoder.gradient_checkpointing_enable() + training_models.append(text_encoder) + else: + text_encoder.to(accelerator.device, dtype=weight_dtype) + text_encoder.requires_grad_(False) # text encoderは学習しない + text_encoder.eval() + else: + unet.to(accelerator.device) # , dtype=weight_dtype) # dtypeを指定すると学習できない + unet.requires_grad_(False) + unet.eval() + text_encoder.to(accelerator.device, dtype=weight_dtype) + text_encoder.requires_grad_(False) + text_encoder.eval() + training_models.append(hypernetwork) + + for m in training_models: + m.requires_grad_(True) + params = [] + for m in training_models: + params.extend(m.parameters()) + params_to_optimize = params + + # 学習に必要なクラスを準備する + print("prepare optimizer, data loader etc.") + + # 8-bit Adamを使う + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("No bitsand bytes / bitsandbytesがインストールされていないようです") + print("use 8-bit Adam optimizer") + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # betaやweight decayはdiffusers DreamBoothもDreamBooth SDもデフォルト値のようなのでオプションはとりあえず省略 + optimizer = optimizer_class(params_to_optimize, lr=args.learning_rate) + + # dataloaderを準備する + # DataLoaderのプロセス数:0はメインプロセスになる + n_workers = min(8, os.cpu_count() - 1) # cpu_count-1 ただし最大8 + train_dataloader = torch.utils.data.DataLoader( + train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers) + + # lr schedulerを用意する + lr_scheduler = diffusers.optimization.get_scheduler( + args.lr_scheduler, optimizer, num_warmup_steps=args.lr_warmup_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps) + + # acceleratorがなんかよろしくやってくれるらしい + if args.full_fp16: + assert args.mixed_precision == "fp16", "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + print("enable 
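`fn_recursive_set_mem_eff` above hand-rolls a depth-first walk over the U-Net's submodules. Purely as an illustration (not a drop-in change to the diff), the same effect can be had with torch's built-in module iterator:

```python
import torch

def set_mem_eff_flag(model: torch.nn.Module, valid: bool) -> None:
    # nn.Module.modules() already yields the module itself and every descendant,
    # so this toggles xformers on any submodule that exposes the setter.
    for module in model.modules():
        if hasattr(module, "set_use_memory_efficient_attention_xformers"):
            module.set_use_memory_efficient_attention_xformers(valid)
```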
full fp16 training.") + + if fine_tuning: + # 実験的機能:勾配も含めたfp16学習を行う モデル全体をfp16にする + if args.full_fp16: + unet.to(weight_dtype) + text_encoder.to(weight_dtype) + + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + else: + if args.full_fp16: + unet.to(weight_dtype) + hypernetwork.to(weight_dtype) + + unet, hypernetwork, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, hypernetwork, optimizer, train_dataloader, lr_scheduler) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + org_unscale_grads = accelerator.scaler._unscale_grads_ + + def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16): + return org_unscale_grads(optimizer, inv_scale, found_inf, True) + + accelerator.scaler._unscale_grads_ = _unscale_grads_replacer + + # TODO accelerateのconfigに指定した型とオプション指定の型とをチェックして異なれば警告を出す + + # resumeする + if args.resume is not None: + print(f"resume training from state: {args.resume}") + accelerator.load_state(args.resume) + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # 学習する + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + print("running training / 学習開始") + print(f" num examples / サンプル数: {train_dataset.images_count}") + print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + print(f" num epochs / epoch数: {num_train_epochs}") + print(f" batch size per device / バッチサイズ: {args.train_batch_size}") + print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}") + print(f" gradient ccumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + global_step = 0 + + # v4で更新:clip_sample=Falseに + # Diffusersのtrain_dreambooth.pyがconfigから持ってくるように変更されたので、clip_sample=Falseになるため、それに合わせる + # 既存の1.4/1.5/2.0/2.1はすべてschdulerのconfigは(クラス名を除いて)同じ + # よくソースを見たら学習時はclip_sampleは関係ないや(;'∀') + noise_scheduler = DDPMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", + num_train_timesteps=1000, clip_sample=False) + + if accelerator.is_main_process: + accelerator.init_trackers("finetuning" if fine_tuning else "hypernetwork") + + # 以下 train_dreambooth.py からほぼコピペ + for epoch in range(num_train_epochs): + print(f"epoch {epoch+1}/{num_train_epochs}") + for m in training_models: + m.train() + + loss_total = 0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(training_models[0]): # 複数モデルに対応していない模様だがとりあえずこうしておく + latents = batch["latents"].to(accelerator.device) + latents = latents * 0.18215 + b_size = latents.shape[0] + + # with torch.no_grad(): + with torch.set_grad_enabled(args.train_text_encoder): + # Get the text embedding for conditioning + input_ids = batch["input_ids"].to(accelerator.device) + input_ids = input_ids.reshape((-1, tokenizer.model_max_length)) # batch_size*3, 77 + + if args.clip_skip is None: + encoder_hidden_states = 
text_encoder(input_ids)[0] + else: + enc_out = text_encoder(input_ids, output_hidden_states=True, return_dict=True) + encoder_hidden_states = enc_out['hidden_states'][-args.clip_skip] + encoder_hidden_states = text_encoder.text_model.final_layer_norm(encoder_hidden_states) + + # bs*3, 77, 768 or 1024 + encoder_hidden_states = encoder_hidden_states.reshape((b_size, -1, encoder_hidden_states.shape[-1])) + + if args.max_token_length is not None: + if args.v2: + # v2: ... ... の三連を ... ... へ戻す 正直この実装でいいのかわからん + states_list = [encoder_hidden_states[:, 0].unsqueeze(1)] # + for i in range(1, args.max_token_length, tokenizer.model_max_length): + chunk = encoder_hidden_states[:, i:i + tokenizer.model_max_length - 2] # の後から 最後の前まで + if i > 0: + for j in range(len(chunk)): + if input_ids[j, 1] == tokenizer.eos_token: # 空、つまり ...のパターン + chunk[j, 0] = chunk[j, 1] # 次の の値をコピーする + states_list.append(chunk) # の後から の前まで + states_list.append(encoder_hidden_states[:, -1].unsqueeze(1)) # のどちらか + encoder_hidden_states = torch.cat(states_list, dim=1) + else: + # v1: ... の三連を ... へ戻す + states_list = [encoder_hidden_states[:, 0].unsqueeze(1)] # + for i in range(1, args.max_token_length, tokenizer.model_max_length): + states_list.append(encoder_hidden_states[:, i:i + tokenizer.model_max_length - 2]) # の後から の前まで + states_list.append(encoder_hidden_states[:, -1].unsqueeze(1)) # + encoder_hidden_states = torch.cat(states_list, dim=1) + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents, device=latents.device) + + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Predict the noise residual + noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if args.v_parameterization: + # v-parameterization training + # Diffusers 0.10.0からv_parameterizationの学習に対応したのでそちらを使う + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, 1.0) # args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if args.logging_dir is not None: + logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]} + accelerator.log(logs, step=global_step) + + loss_total += current_loss + avr_loss = loss_total / (step+1) + logs = {"loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if args.logging_dir is not None: + logs = {"epoch_loss": loss_total / len(train_dataloader)} + accelerator.log(logs, step=epoch+1) + + accelerator.wait_for_everyone() + + if args.save_every_n_epochs is not None: + if (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs: + 
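With `--v_parameterization` the network regresses the "velocity" rather than the raw noise; `DDPMScheduler.get_velocity` (diffusers 0.10+) computes v_t = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * x_0. A sketch of that same computation, shown only to make the target explicit:

```python
import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(beta_start=0.00085, beta_end=0.012,
                          beta_schedule="scaled_linear", num_train_timesteps=1000)

def velocity_target(latents, noise, timesteps):
    # v_t = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * latents
    abar = scheduler.alphas_cumprod.to(latents.device)[timesteps].view(-1, 1, 1, 1)
    return abar.sqrt() * noise - (1 - abar).sqrt() * latents

latents = torch.randn(2, 4, 64, 64)
noise = torch.randn_like(latents)
t = torch.randint(0, 1000, (2,))
assert torch.allclose(velocity_target(latents, noise, t),
                      scheduler.get_velocity(latents, noise, t), atol=1e-5)
```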
print("saving checkpoint.") + os.makedirs(args.output_dir, exist_ok=True) + ckpt_file = os.path.join(args.output_dir, model_util.get_epoch_ckpt_name(use_safetensors, epoch + 1)) + + if fine_tuning: + if save_stable_diffusion_format: + model_util.save_stable_diffusion_checkpoint(args.v2, ckpt_file, unwrap_model(text_encoder), unwrap_model(unet), + src_stable_diffusion_ckpt, epoch + 1, global_step, save_dtype, vae) + else: + out_dir = os.path.join(args.output_dir, EPOCH_DIFFUSERS_DIR_NAME.format(epoch + 1)) + os.makedirs(out_dir, exist_ok=True) + model_util.save_diffusers_checkpoint(args.v2, out_dir, unwrap_model(text_encoder), unwrap_model(unet), + src_diffusers_model_path, vae=vae, use_safetensors=use_safetensors) + else: + save_hypernetwork(ckpt_file, unwrap_model(hypernetwork)) + + if args.save_state: + print("saving state.") + accelerator.save_state(os.path.join(args.output_dir, EPOCH_STATE_NAME.format(epoch + 1))) + + is_main_process = accelerator.is_main_process + if is_main_process: + if fine_tuning: + unet = unwrap_model(unet) + text_encoder = unwrap_model(text_encoder) + else: + hypernetwork = unwrap_model(hypernetwork) + + accelerator.end_training() + + if args.save_state: + print("saving last state.") + accelerator.save_state(os.path.join(args.output_dir, LAST_STATE_NAME)) + + del accelerator # この後メモリを使うのでこれは消す + + if is_main_process: + os.makedirs(args.output_dir, exist_ok=True) + ckpt_file = os.path.join(args.output_dir, model_util.get_last_ckpt_name(use_safetensors)) + + if fine_tuning: + if save_stable_diffusion_format: + print(f"save trained model as StableDiffusion checkpoint to {ckpt_file}") + model_util.save_stable_diffusion_checkpoint(args.v2, ckpt_file, text_encoder, unet, + src_stable_diffusion_ckpt, epoch, global_step, save_dtype, vae) + else: + # Create the pipeline using using the trained modules and save it. 
+ print(f"save trained model as Diffusers to {args.output_dir}") + out_dir = os.path.join(args.output_dir, LAST_DIFFUSERS_DIR_NAME) + os.makedirs(out_dir, exist_ok=True) + model_util.save_diffusers_checkpoint(args.v2, out_dir, text_encoder, unet, + src_diffusers_model_path, vae=vae, use_safetensors=use_safetensors) + else: + print(f"save trained model to {ckpt_file}") + save_hypernetwork(ckpt_file, hypernetwork) + + print("model saved.") + + +# region モジュール入れ替え部 +""" +高速化のためのモジュール入れ替え +""" + +# FlashAttentionを使うCrossAttention +# based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py +# LICENSE MIT https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/LICENSE + +# constants + +EPSILON = 1e-6 + +# helper functions + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + +# flash attention forwards and backwards + +# https://arxiv.org/abs/2205.14135 + + +class FlashAttentionFunction(torch.autograd.function.Function): + @ staticmethod + @ torch.no_grad() + def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size): + """ Algorithm 2 in the paper """ + + device = q.device + dtype = q.dtype + max_neg_value = -torch.finfo(q.dtype).max + qk_len_diff = max(k.shape[-2] - q.shape[-2], 0) + + o = torch.zeros_like(q) + all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype=dtype, device=device) + all_row_maxes = torch.full((*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device) + + scale = (q.shape[-1] ** -0.5) + + if not exists(mask): + mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size) + else: + mask = rearrange(mask, 'b n -> b 1 1 n') + mask = mask.split(q_bucket_size, dim=-1) + + row_splits = zip( + q.split(q_bucket_size, dim=-2), + o.split(q_bucket_size, dim=-2), + mask, + all_row_sums.split(q_bucket_size, dim=-2), + all_row_maxes.split(q_bucket_size, dim=-2), + ) + + for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits): + q_start_index = ind * q_bucket_size - qk_len_diff + + col_splits = zip( + k.split(k_bucket_size, dim=-2), + v.split(k_bucket_size, dim=-2), + ) + + for k_ind, (kc, vc) in enumerate(col_splits): + k_start_index = k_ind * k_bucket_size + + attn_weights = einsum('... i d, ... j d -> ... i j', qc, kc) * scale + + if exists(row_mask): + attn_weights.masked_fill_(~row_mask, max_neg_value) + + if causal and q_start_index < (k_start_index + k_bucket_size - 1): + causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool, + device=device).triu(q_start_index - k_start_index + 1) + attn_weights.masked_fill_(causal_mask, max_neg_value) + + block_row_maxes = attn_weights.amax(dim=-1, keepdims=True) + attn_weights -= block_row_maxes + exp_weights = torch.exp(attn_weights) + + if exists(row_mask): + exp_weights.masked_fill_(~row_mask, 0.) + + block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(min=EPSILON) + + new_row_maxes = torch.maximum(block_row_maxes, row_maxes) + + exp_values = einsum('... i j, ... j d -> ... 
i d', exp_weights, vc) + + exp_row_max_diff = torch.exp(row_maxes - new_row_maxes) + exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes) + + new_row_sums = exp_row_max_diff * row_sums + exp_block_row_max_diff * block_row_sums + + oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_((exp_block_row_max_diff / new_row_sums) * exp_values) + + row_maxes.copy_(new_row_maxes) + row_sums.copy_(new_row_sums) + + ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size) + ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes) + + return o + + @ staticmethod + @ torch.no_grad() + def backward(ctx, do): + """ Algorithm 4 in the paper """ + + causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args + q, k, v, o, l, m = ctx.saved_tensors + + device = q.device + + max_neg_value = -torch.finfo(q.dtype).max + qk_len_diff = max(k.shape[-2] - q.shape[-2], 0) + + dq = torch.zeros_like(q) + dk = torch.zeros_like(k) + dv = torch.zeros_like(v) + + row_splits = zip( + q.split(q_bucket_size, dim=-2), + o.split(q_bucket_size, dim=-2), + do.split(q_bucket_size, dim=-2), + mask, + l.split(q_bucket_size, dim=-2), + m.split(q_bucket_size, dim=-2), + dq.split(q_bucket_size, dim=-2) + ) + + for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits): + q_start_index = ind * q_bucket_size - qk_len_diff + + col_splits = zip( + k.split(k_bucket_size, dim=-2), + v.split(k_bucket_size, dim=-2), + dk.split(k_bucket_size, dim=-2), + dv.split(k_bucket_size, dim=-2), + ) + + for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits): + k_start_index = k_ind * k_bucket_size + + attn_weights = einsum('... i d, ... j d -> ... i j', qc, kc) * scale + + if causal and q_start_index < (k_start_index + k_bucket_size - 1): + causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool, + device=device).triu(q_start_index - k_start_index + 1) + attn_weights.masked_fill_(causal_mask, max_neg_value) + + exp_attn_weights = torch.exp(attn_weights - mc) + + if exists(row_mask): + exp_attn_weights.masked_fill_(~row_mask, 0.) + + p = exp_attn_weights / lc + + dv_chunk = einsum('... i j, ... i d -> ... j d', p, doc) + dp = einsum('... i d, ... j d -> ... i j', doc, vc) + + D = (doc * oc).sum(dim=-1, keepdims=True) + ds = p * scale * (dp - D) + + dq_chunk = einsum('... i j, ... j d -> ... i d', ds, kc) + dk_chunk = einsum('... i j, ... i d -> ... 
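The running `row_maxes`/`row_sums` updates above are the heart of FlashAttention: the attention row is processed one key block at a time while a running max and a rescaled running sum keep the softmax numerically exact. The trick in one dimension, illustrative only:

```python
import torch

scores = torch.randn(8)            # one row of attention logits
m = torch.tensor(float("-inf"))    # running max
s = torch.tensor(0.0)              # running sum of exp(score - m)

for block in scores.split(3):      # process the row in chunks, as the kernel does per key block
    m_new = torch.maximum(m, block.max())
    s = s * torch.exp(m - m_new) + torch.exp(block - m_new).sum()
    m = m_new

# s * exp(m) equals the full softmax denominator computed in one pass
assert torch.allclose(s * torch.exp(m), torch.exp(scores).sum())
```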
j d', ds, qc) + + dqc.add_(dq_chunk) + dkc.add_(dk_chunk) + dvc.add_(dv_chunk) + + return dq, dk, dv, None, None, None, None + + +def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers): + if mem_eff_attn: + replace_unet_cross_attn_to_memory_efficient() + elif xformers: + replace_unet_cross_attn_to_xformers() + + +def replace_unet_cross_attn_to_memory_efficient(): + print("Replace CrossAttention.forward to use FlashAttention (not xformers)") + flash_func = FlashAttentionFunction + + def forward_flash_attn(self, x, context=None, mask=None): + q_bucket_size = 512 + k_bucket_size = 1024 + + h = self.heads + q = self.to_q(x) + + context = context if context is not None else x + context = context.to(x.dtype) + + if hasattr(self, 'hypernetwork') and self.hypernetwork is not None: + context_k, context_v = self.hypernetwork.forward(x, context) + context_k = context_k.to(x.dtype) + context_v = context_v.to(x.dtype) + else: + context_k = context + context_v = context + + k = self.to_k(context_k) + v = self.to_v(context_v) + del context, x + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + + out = flash_func.apply(q, k, v, mask, False, q_bucket_size, k_bucket_size) + + out = rearrange(out, 'b h n d -> b n (h d)') + + # diffusers 0.7.0~ わざわざ変えるなよ (;´Д`) + out = self.to_out[0](out) + out = self.to_out[1](out) + return out + + diffusers.models.attention.CrossAttention.forward = forward_flash_attn + + +def replace_unet_cross_attn_to_xformers(): + print("Replace CrossAttention.forward to use xformers") + try: + import xformers.ops + except ImportError: + raise ImportError("No xformers / xformersがインストールされていないようです") + + def forward_xformers(self, x, context=None, mask=None): + h = self.heads + q_in = self.to_q(x) + + context = default(context, x) + context = context.to(x.dtype) + + if hasattr(self, 'hypernetwork') and self.hypernetwork is not None: + context_k, context_v = self.hypernetwork.forward(x, context) + context_k = context_k.to(x.dtype) + context_v = context_v.to(x.dtype) + else: + context_k = context + context_v = context + + k_in = self.to_k(context_k) + v_in = self.to_v(context_v) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b n h d', h=h), (q_in, k_in, v_in)) + del q_in, k_in, v_in + + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None) # 最適なのを選んでくれる + + out = rearrange(out, 'b n h d -> b n (h d)', h=h) + + # diffusers 0.7.0~ + out = self.to_out[0](out) + out = self.to_out[1](out) + return out + + diffusers.models.attention.CrossAttention.forward = forward_xformers +# endregion + + +if __name__ == '__main__': + # torch.cuda.set_per_process_memory_fraction(0.48) + parser = argparse.ArgumentParser() + parser.add_argument("--v2", action='store_true', + help='load Stable Diffusion v2.0 model / Stable Diffusion 2.0のモデルを読み込む') + parser.add_argument("--v_parameterization", action='store_true', + help='enable v-parameterization training / v-parameterization学習を有効にする') + parser.add_argument("--pretrained_model_name_or_path", type=str, default=None, + help="pretrained model to train, directory to Diffusers model or StableDiffusion checkpoint / 学習元モデル、Diffusers形式モデルのディレクトリまたはStableDiffusionのckptファイル") + parser.add_argument("--in_json", type=str, default=None, help="metadata file to input / 読みこむメタデータファイル") + parser.add_argument("--shuffle_caption", action="store_true", + help="shuffle comma-separated caption when fine tuning / 
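Both replacement forwards below rely on `einops.rearrange` to split the fused head dimension before attention and merge it back afterwards (the FlashAttention path moves heads next to batch, the xformers path keeps them in the third axis). A two-line illustration of that reshape, with shapes assumed (e.g. 8 heads of width 40):

```python
import torch
from einops import rearrange

x = torch.randn(2, 64, 8 * 40)                    # (batch, tokens, heads * head_dim)
q = rearrange(x, 'b n (h d) -> b h n d', h=8)     # (2, 8, 64, 40): one attention map per head
assert rearrange(q, 'b h n d -> b n (h d)').shape == x.shape
```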
fine tuning時にコンマで区切られたcaptionの各要素をshuffleする") + parser.add_argument("--keep_tokens", type=int, default=None, + help="keep heading N tokens when shuffling caption tokens / captionのシャッフル時に、先頭からこの個数のトークンをシャッフルしないで残す") + parser.add_argument("--train_data_dir", type=str, default=None, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("--dataset_repeats", type=int, default=None, help="num times to repeat dataset / 学習にデータセットを繰り返す回数") + parser.add_argument("--output_dir", type=str, default=None, + help="directory to output trained model, save as same format as input / 学習後のモデル出力先ディレクトリ(入力と同じ形式で保存)") + parser.add_argument("--save_precision", type=str, default=None, + choices=[None, "float", "fp16", "bf16"], help="precision in saving (available in StableDiffusion checkpoint) / 保存時に精度を変更して保存する(StableDiffusion形式での保存時のみ有効)") + parser.add_argument("--save_model_as", type=str, default=None, choices=[None, "ckpt", "safetensors", "diffusers", "diffusers_safetensors"], + help="format to save the model (default is same to original) / モデル保存時の形式(未指定時は元モデルと同じ)") + parser.add_argument("--use_safetensors", action='store_true', + help="use safetensors format to save (if save_model_as is not specified) / checkpoint、モデルをsafetensors形式で保存する(save_model_as未指定時)") + parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") + parser.add_argument("--hypernetwork_module", type=str, default=None, + help='train hypernetwork instead of fine tuning, module to use / fine tuningの代わりにHypernetworkの学習をする場合、そのモジュール') + parser.add_argument("--hypernetwork_weights", type=str, default=None, + help='hypernetwork weights to initialize for additional training / Hypernetworkの学習時に読み込む重み(Hypernetworkの追加学習)') + parser.add_argument("--save_every_n_epochs", type=int, default=None, + help="save checkpoint every N epochs / 学習中のモデルを指定エポックごとに保存する") + parser.add_argument("--save_state", action="store_true", + help="save training state additionally (including optimizer states etc.) 
/ optimizerなど学習状態も含めたstateを追加で保存する") + parser.add_argument("--resume", type=str, default=None, + help="saved state to resume training / 学習再開するモデルのstate") + parser.add_argument("--max_token_length", type=int, default=None, choices=[None, 150, 225], + help="max token length of text encoder (default for 75, 150 or 225) / text encoderのトークンの最大長(未指定で75、150または225が指定可)") + parser.add_argument("--train_batch_size", type=int, default=1, + help="batch size for training / 学習時のバッチサイズ") + parser.add_argument("--use_8bit_adam", action="store_true", + help="use 8bit Adam optimizer (requires bitsandbytes) / 8bit Adamオプティマイザを使う(bitsandbytesのインストールが必要)") + parser.add_argument("--mem_eff_attn", action="store_true", + help="use memory efficient attention for CrossAttention / CrossAttentionに省メモリ版attentionを使う") + parser.add_argument("--xformers", action="store_true", + help="use xformers for CrossAttention / CrossAttentionにxformersを使う") + parser.add_argument("--diffusers_xformers", action='store_true', + help='use xformers by diffusers (Hypernetworks doesn\'t work) / Diffusersでxformersを使用する(Hypernetwork利用不可)') + parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率") + parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数") + parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed") + parser.add_argument("--gradient_checkpointing", action="store_true", + help="enable gradient checkpointing / grandient checkpointingを有効にする") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass / 学習時に逆伝播をする前に勾配を合計するステップ数") + parser.add_argument("--mixed_precision", type=str, default="no", + choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度") + parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する") + parser.add_argument("--clip_skip", type=int, default=None, + help="use output of nth layer from back of text encoder (n>=1) / text encoderの後ろからn番目の層の出力を用いる(nは1以上)") + parser.add_argument("--debug_dataset", action="store_true", + help="show images for debugging (do not train) / デバッグ用に学習データを画面表示する(学習は行わない)") + parser.add_argument("--logging_dir", type=str, default=None, + help="enable logging and output TensorBoard log to this directory / ログ出力を有効にしてこのディレクトリにTensorBoard用のログを出力する") + parser.add_argument("--log_prefix", type=str, default=None, help="add prefix for each log directory / ログディレクトリ名の先頭に追加する文字列") + parser.add_argument("--lr_scheduler", type=str, default="constant", + help="scheduler to use for learning rate / 学習率のスケジューラ: linear, cosine, cosine_with_restarts, polynomial, constant (default), constant_with_warmup") + parser.add_argument("--lr_warmup_steps", type=int, default=0, + help="Number of steps for the warmup in the lr scheduler (default is 0) / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)") + + args = parser.parse_args() + train(args) diff --git a/finetune.bat b/finetune.bat new file mode 100644 index 0000000..6067935 --- /dev/null +++ b/finetune.bat @@ -0,0 +1 @@ +.\venv\Scripts\python.exe .\finetune_gui.py \ No newline at end of file diff --git a/BLIP_caption/models/blip.py b/finetune/blip/blip.py similarity index 97% rename from BLIP_caption/models/blip.py rename to finetune/blip/blip.py index 38678f6..7851fb0 100644 --- a/BLIP_caption/models/blip.py +++ b/finetune/blip/blip.py @@ -8,8 +8,10 @@ import warnings 
warnings.filterwarnings("ignore") -from models.vit import VisionTransformer, interpolate_pos_embed -from models.med import BertConfig, BertModel, BertLMHeadModel +# from models.vit import VisionTransformer, interpolate_pos_embed +# from models.med import BertConfig, BertModel, BertLMHeadModel +from blip.vit import VisionTransformer, interpolate_pos_embed +from blip.med import BertConfig, BertModel, BertLMHeadModel from transformers import BertTokenizer import torch diff --git a/BLIP_caption/models/med.py b/finetune/blip/med.py similarity index 99% rename from BLIP_caption/models/med.py rename to finetune/blip/med.py index 572d39d..7b00a35 100644 --- a/BLIP_caption/models/med.py +++ b/finetune/blip/med.py @@ -929,7 +929,7 @@ class BertLMHeadModel(BertPreTrainedModel): cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, **model_kwargs): + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: @@ -943,8 +943,8 @@ class BertLMHeadModel(BertPreTrainedModel): "input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past, - "encoder_hidden_states": encoder_hidden_states, - "encoder_attention_mask": encoder_attention_mask, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), "is_decoder": True, } diff --git a/finetune/blip/med_config.json b/finetune/blip/med_config.json new file mode 100644 index 0000000..dc12b99 --- /dev/null +++ b/finetune/blip/med_config.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30524, + "encoder_width": 768, + "add_cross_attention": true + } + \ No newline at end of file diff --git a/BLIP_caption/models/vit.py b/finetune/blip/vit.py similarity index 100% rename from BLIP_caption/models/vit.py rename to finetune/blip/vit.py diff --git a/finetune/clean_captions_and_tags.py b/finetune/clean_captions_and_tags.py new file mode 100644 index 0000000..8f53737 --- /dev/null +++ b/finetune/clean_captions_and_tags.py @@ -0,0 +1,123 @@ +# このスクリプトのライセンスは、Apache License 2.0とします +# (c) 2022 Kohya S. 
@kohya_ss + +import argparse +import glob +import os +import json + +from tqdm import tqdm + + +def clean_tags(image_key, tags): + # replace '_' to ' ' + tags = tags.replace('_', ' ') + + # remove rating: deepdanbooruのみ + tokens = tags.split(", rating") + if len(tokens) == 1: + # WD14 taggerのときはこちらになるのでメッセージは出さない + # print("no rating:") + # print(f"{image_key} {tags}") + pass + else: + if len(tokens) > 2: + print("multiple ratings:") + print(f"{image_key} {tags}") + tags = tokens[0] + + return tags + + +# 上から順に検索、置換される +# ('置換元文字列', '置換後文字列') +CAPTION_REPLACEMENTS = [ + ('anime anime', 'anime'), + ('young ', ''), + ('anime girl', 'girl'), + ('cartoon female', 'girl'), + ('cartoon lady', 'girl'), + ('cartoon character', 'girl'), # a or ~s + ('cartoon woman', 'girl'), + ('cartoon women', 'girls'), + ('cartoon girl', 'girl'), + ('anime female', 'girl'), + ('anime lady', 'girl'), + ('anime character', 'girl'), # a or ~s + ('anime woman', 'girl'), + ('anime women', 'girls'), + ('lady', 'girl'), + ('female', 'girl'), + ('woman', 'girl'), + ('women', 'girls'), + ('people', 'girls'), + ('person', 'girl'), + ('a cartoon figure', 'a figure'), + ('a cartoon image', 'an image'), + ('a cartoon picture', 'a picture'), + ('an anime cartoon image', 'an image'), + ('a cartoon anime drawing', 'a drawing'), + ('a cartoon drawing', 'a drawing'), + ('girl girl', 'girl'), +] + + +def clean_caption(caption): + for rf, rt in CAPTION_REPLACEMENTS: + replaced = True + while replaced: + bef = caption + caption = caption.replace(rf, rt) + replaced = bef != caption + return caption + + +def main(args): + if os.path.exists(args.in_json): + print(f"loading existing metadata: {args.in_json}") + with open(args.in_json, "rt", encoding='utf-8') as f: + metadata = json.load(f) + else: + print("no metadata / メタデータファイルがありません") + return + + print("cleaning captions and tags.") + image_keys = list(metadata.keys()) + for image_key in tqdm(image_keys): + tags = metadata[image_key].get('tags') + if tags is None: + print(f"image does not have tags / メタデータにタグがありません: {image_key}") + else: + metadata[image_key]['tags'] = clean_tags(image_key, tags) + + caption = metadata[image_key].get('caption') + if caption is None: + print(f"image does not have caption / メタデータにキャプションがありません: {image_key}") + else: + metadata[image_key]['caption'] = clean_caption(caption) + + # metadataを書き出して終わり + print(f"writing metadata: {args.out_json}") + with open(args.out_json, "wt", encoding='utf-8') as f: + json.dump(metadata, f, indent=2) + print("done!") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + # parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + + args, unknown = parser.parse_known_args() + if len(unknown) == 1: + print("WARNING: train_data_dir argument is removed. This script will not work with three arguments in future. 
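`clean_caption` applies each replacement pair repeatedly until the caption stops changing, so chained BLIP phrasings collapse step by step. A worked example (output traced by hand through the table above; the import path assumes finetune/ is on sys.path):

```python
from clean_captions_and_tags import clean_caption   # the script added in this diff

# 'cartoon lady' -> 'girl' fires first, then 'a cartoon drawing' -> 'a drawing':
print(clean_caption("a cartoon drawing of a cartoon lady"))   # -> "a drawing of a girl"
```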
Please specify two arguments: in_json and out_json.") + print("All captions and tags in the metadata are processed.") + print("警告: train_data_dir引数は不要になりました。将来的には三つの引数を指定すると動かなくなる予定です。読み込み元のメタデータと書き出し先の二つの引数だけ指定してください。") + print("メタデータ内のすべてのキャプションとタグが処理されます。") + args.in_json = args.out_json + args.out_json = unknown[0] + elif len(unknown) > 0: + raise ValueError(f"error: unrecognized arguments: {unknown}") + + main(args) diff --git a/finetune/hypernetwork_nai.py b/finetune/hypernetwork_nai.py new file mode 100644 index 0000000..dcaaa71 --- /dev/null +++ b/finetune/hypernetwork_nai.py @@ -0,0 +1,96 @@ +# NAI compatible + +import torch + + +class HypernetworkModule(torch.nn.Module): + def __init__(self, dim, multiplier=1.0): + super().__init__() + + linear1 = torch.nn.Linear(dim, dim * 2) + linear2 = torch.nn.Linear(dim * 2, dim) + linear1.weight.data.normal_(mean=0.0, std=0.01) + linear1.bias.data.zero_() + linear2.weight.data.normal_(mean=0.0, std=0.01) + linear2.bias.data.zero_() + linears = [linear1, linear2] + + self.linear = torch.nn.Sequential(*linears) + self.multiplier = multiplier + + def forward(self, x): + return x + self.linear(x) * self.multiplier + + +class Hypernetwork(torch.nn.Module): + enable_sizes = [320, 640, 768, 1280] + # return self.modules[Hypernetwork.enable_sizes.index(size)] + + def __init__(self, multiplier=1.0) -> None: + super().__init__() + self.modules = [] + for size in Hypernetwork.enable_sizes: + self.modules.append((HypernetworkModule(size, multiplier), HypernetworkModule(size, multiplier))) + self.register_module(f"{size}_0", self.modules[-1][0]) + self.register_module(f"{size}_1", self.modules[-1][1]) + + def apply_to_stable_diffusion(self, text_encoder, vae, unet): + blocks = unet.input_blocks + [unet.middle_block] + unet.output_blocks + for block in blocks: + for subblk in block: + if 'SpatialTransformer' in str(type(subblk)): + for tf_block in subblk.transformer_blocks: + for attn in [tf_block.attn1, tf_block.attn2]: + size = attn.context_dim + if size in Hypernetwork.enable_sizes: + attn.hypernetwork = self + else: + attn.hypernetwork = None + + def apply_to_diffusers(self, text_encoder, vae, unet): + blocks = unet.down_blocks + [unet.mid_block] + unet.up_blocks + for block in blocks: + if hasattr(block, 'attentions'): + for subblk in block.attentions: + if 'SpatialTransformer' in str(type(subblk)) or 'Transformer2DModel' in str(type(subblk)): # 0.6.0 and 0.7~ + for tf_block in subblk.transformer_blocks: + for attn in [tf_block.attn1, tf_block.attn2]: + size = attn.to_k.in_features + if size in Hypernetwork.enable_sizes: + attn.hypernetwork = self + else: + attn.hypernetwork = None + return True # TODO error checking + + def forward(self, x, context): + size = context.shape[-1] + assert size in Hypernetwork.enable_sizes + module = self.modules[Hypernetwork.enable_sizes.index(size)] + return module[0].forward(context), module[1].forward(context) + + def load_from_state_dict(self, state_dict): + # old ver to new ver + changes = { + 'linear1.bias': 'linear.0.bias', + 'linear1.weight': 'linear.0.weight', + 'linear2.bias': 'linear.1.bias', + 'linear2.weight': 'linear.1.weight', + } + for key_from, key_to in changes.items(): + if key_from in state_dict: + state_dict[key_to] = state_dict[key_from] + del state_dict[key_from] + + for size, sd in state_dict.items(): + if type(size) == int: + self.modules[Hypernetwork.enable_sizes.index(size)][0].load_state_dict(sd[0], strict=True) + self.modules[Hypernetwork.enable_sizes.index(size)][1].load_state_dict(sd[1], 
strict=True) + return True + + def get_state_dict(self): + state_dict = {} + for i, size in enumerate(Hypernetwork.enable_sizes): + sd0 = self.modules[i][0].state_dict() + sd1 = self.modules[i][1].state_dict() + state_dict[size] = [sd0, sd1] + return state_dict diff --git a/BLIP_caption/make_captions.py b/finetune/make_captions.py similarity index 76% rename from BLIP_caption/make_captions.py rename to finetune/make_captions.py index 59272ff..5808051 100644 --- a/BLIP_caption/make_captions.py +++ b/finetune/make_captions.py @@ -1,10 +1,8 @@ -# このスクリプトのライセンスは、Apache License 2.0とします -# (c) 2022 Kohya S. @kohya_ss - import argparse import glob import os import json +import random from PIL import Image from tqdm import tqdm @@ -12,51 +10,45 @@ import numpy as np import torch from torchvision import transforms from torchvision.transforms.functional import InterpolationMode -from models.blip import blip_decoder +from blip.blip import blip_decoder # from Salesforce_BLIP.models.blip import blip_decoder -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') def main(args): - cwd = os.getcwd() - print('Current Working Directory is: ', cwd) + # fix the seed for reproducibility + seed = args.seed # + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + if not os.path.exists("blip"): + cwd = os.getcwd() + print('Current Working Directory is: ', cwd) + os.chdir('finetune') - os.chdir('.\BLIP_caption') - image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \ glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) print(f"found {len(image_paths)} images.") print(f"loading BLIP caption: {args.caption_weights}") - # image_size = 384 - # model = blip_decoder(pretrained=args.caption_weights, image_size=image_size, vit='large', med_config='configs/med_config.json') - # model.eval() - # model = model.to(device) - image_size = 384 + model = blip_decoder(pretrained=args.caption_weights, image_size=image_size, vit='large', med_config="./blip/med_config.json") + model.eval() + model = model.to(DEVICE) + print("BLIP loaded") + + # 正方形でいいのか? という気がするがソースがそうなので transform = transforms.Compose([ - transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC), + transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) - ]) - - model_url = args.caption_weights # 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth' - - model = blip_decoder(pretrained=model_url, image_size=384, vit='large') - model.eval() - model = model.to(device) - print("BLIP loaded") - # 正方形でいいのか? 
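The NAI-compatible hypernetwork above is a pair of residual two-layer MLPs per enabled attention width (320/640/768/1280); at inference the cross-attention layers feed their context through the matching pair to obtain modified K and V inputs. A quick shape check, purely illustrative and assuming finetune/ is importable:

```python
import torch
from hypernetwork_nai import Hypernetwork    # finetune/hypernetwork_nai.py from this diff

hn = Hypernetwork()
context = torch.randn(2, 77, 768)            # text-encoder output fed to cross-attention
k_ctx, v_ctx = hn(None, context)             # the MLP pair is picked by the context width (768)
print(k_ctx.shape, v_ctx.shape)              # torch.Size([2, 77, 768]) twice: residual, shape preserved
```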
という気がするがソースがそうなので - # transform = transforms.Compose([ - # transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - # transforms.ToTensor(), - # transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) - # ]) + ]) # captioningする def run_batch(path_imgs): - imgs = torch.stack([im for _, im in path_imgs]).to(device) + imgs = torch.stack([im for _, im in path_imgs]).to(DEVICE) with torch.no_grad(): if args.beam_search: @@ -92,7 +84,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") - parser.add_argument("caption_weights", type=str, + parser.add_argument("--caption_weights", type=str, default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth", help="BLIP caption weights (model_large_caption.pth) / BLIP captionの重みファイル(model_large_caption.pth)") parser.add_argument("--caption_extention", type=str, default=None, help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)") @@ -104,6 +96,7 @@ if __name__ == '__main__': parser.add_argument("--top_p", type=float, default=0.9, help="top_p in Nucleus sampling / Nucleus sampling時のtop_p") parser.add_argument("--max_length", type=int, default=75, help="max length of caption / captionの最大長") parser.add_argument("--min_length", type=int, default=5, help="min length of caption / captionの最小長") + parser.add_argument('--seed', default=42, type=int, help='seed for reproducibility / 再現性を確保するための乱数seed') parser.add_argument("--debug", action="store_true", help="debug mode") args = parser.parse_args() diff --git a/finetune/merge_captions_to_metadata.py b/finetune/merge_captions_to_metadata.py new file mode 100644 index 0000000..2da6356 --- /dev/null +++ b/finetune/merge_captions_to_metadata.py @@ -0,0 +1,68 @@ +# このスクリプトのライセンスは、Apache License 2.0とします +# (c) 2022 Kohya S. 
@kohya_ss + +import argparse +import glob +import os +import json + +from tqdm import tqdm + + +def main(args): + image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \ + glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) + print(f"found {len(image_paths)} images.") + + if args.in_json is None and os.path.isfile(args.out_json): + args.in_json = args.out_json + + if args.in_json is not None: + print(f"loading existing metadata: {args.in_json}") + with open(args.in_json, "rt", encoding='utf-8') as f: + metadata = json.load(f) + print("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます") + else: + print("new metadata will be created / 新しいメタデータファイルが作成されます") + metadata = {} + + print("merge caption texts to metadata json.") + for image_path in tqdm(image_paths): + caption_path = os.path.splitext(image_path)[0] + args.caption_extension + with open(caption_path, "rt", encoding='utf-8') as f: + caption = f.readlines()[0].strip() + + image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0] + if image_key not in metadata: + metadata[image_key] = {} + + metadata[image_key]['caption'] = caption + if args.debug: + print(image_key, caption) + + # metadataを書き出して終わり + print(f"writing metadata: {args.out_json}") + with open(args.out_json, "wt", encoding='utf-8') as f: + json.dump(metadata, f, indent=2) + print("done!") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") + parser.add_argument("--caption_extention", type=str, default=None, + help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)") + parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子") + parser.add_argument("--full_path", action="store_true", + help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") + parser.add_argument("--debug", action="store_true", help="debug mode") + + args = parser.parse_args() + + # スペルミスしていたオプションを復元する + if args.caption_extention is not None: + args.caption_extension = args.caption_extention + + main(args) diff --git a/finetune/merge_dd_tags_to_metadata.py b/finetune/merge_dd_tags_to_metadata.py new file mode 100644 index 0000000..8101ecd --- /dev/null +++ b/finetune/merge_dd_tags_to_metadata.py @@ -0,0 +1,60 @@ +# このスクリプトのライセンスは、Apache License 2.0とします +# (c) 2022 Kohya S. 
@kohya_ss + +import argparse +import glob +import os +import json + +from tqdm import tqdm + + +def main(args): + image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \ + glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) + print(f"found {len(image_paths)} images.") + + if args.in_json is None and os.path.isfile(args.out_json): + args.in_json = args.out_json + + if args.in_json is not None: + print(f"loading existing metadata: {args.in_json}") + with open(args.in_json, "rt", encoding='utf-8') as f: + metadata = json.load(f) + print("tags data for existing images will be overwritten / 既存の画像のタグは上書きされます") + else: + print("new metadata will be created / 新しいメタデータファイルが作成されます") + metadata = {} + + print("merge tags to metadata json.") + for image_path in tqdm(image_paths): + tags_path = os.path.splitext(image_path)[0] + '.txt' + with open(tags_path, "rt", encoding='utf-8') as f: + tags = f.readlines()[0].strip() + + image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0] + if image_key not in metadata: + metadata[image_key] = {} + + metadata[image_key]['tags'] = tags + if args.debug: + print(image_key, tags) + + # metadataを書き出して終わり + print(f"writing metadata: {args.out_json}") + with open(args.out_json, "wt", encoding='utf-8') as f: + json.dump(metadata, f, indent=2) + print("done!") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") + parser.add_argument("--full_path", action="store_true", + help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") + parser.add_argument("--debug", action="store_true", help="debug mode, print tags") + + args = parser.parse_args() + main(args) diff --git a/finetune/prepare_buckets_latents.py b/finetune/prepare_buckets_latents.py new file mode 100644 index 0000000..e2cebe8 --- /dev/null +++ b/finetune/prepare_buckets_latents.py @@ -0,0 +1,177 @@ +# このスクリプトのライセンスは、Apache License 2.0とします +# (c) 2022 Kohya S. 
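Both merge scripts write into the same metadata JSON keyed by image name (or by full path with `--full_path`); `prepare_buckets_latents.py` below then adds the bucket resolution. A hypothetical entry after all three steps have run (field values invented for illustration):

```python
metadata = {
    "img_0001": {
        "caption": "a drawing of a girl in a blue dress",   # from merge_captions_to_metadata.py
        "tags": "1girl, blue dress, smile",                 # from merge_dd_tags_to_metadata.py
        "train_resolution": [512, 512],                     # from prepare_buckets_latents.py
    },
}
```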
@kohya_ss + +import argparse +import glob +import os +import json + +from tqdm import tqdm +import numpy as np +from diffusers import AutoencoderKL +from PIL import Image +import cv2 +import torch +from torchvision import transforms + +import library.model_util as model_util + +DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +IMAGE_TRANSFORMS = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] +) + + +def get_latents(vae, images, weight_dtype): + img_tensors = [IMAGE_TRANSFORMS(image) for image in images] + img_tensors = torch.stack(img_tensors) + img_tensors = img_tensors.to(DEVICE, weight_dtype) + with torch.no_grad(): + latents = vae.encode(img_tensors).latent_dist.sample().float().to("cpu").numpy() + return latents + + +def main(args): + image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \ + glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) + print(f"found {len(image_paths)} images.") + + if os.path.exists(args.in_json): + print(f"loading existing metadata: {args.in_json}") + with open(args.in_json, "rt", encoding='utf-8') as f: + metadata = json.load(f) + else: + print(f"no metadata / メタデータファイルがありません: {args.in_json}") + return + + weight_dtype = torch.float32 + if args.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif args.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + vae = model_util.load_vae(args.model_name_or_path, weight_dtype) + vae.eval() + vae.to(DEVICE, dtype=weight_dtype) + + # bucketのサイズを計算する + max_reso = tuple([int(t) for t in args.max_resolution.split(',')]) + assert len(max_reso) == 2, f"illegal resolution (not 'width,height') / 画像サイズに誤りがあります。'幅,高さ'で指定してください: {args.max_resolution}" + + bucket_resos, bucket_aspect_ratios = model_util.make_bucket_resolutions( + max_reso, args.min_bucket_reso, args.max_bucket_reso) + + # 画像をひとつずつ適切なbucketに割り当てながらlatentを計算する + bucket_aspect_ratios = np.array(bucket_aspect_ratios) + buckets_imgs = [[] for _ in range(len(bucket_resos))] + bucket_counts = [0 for _ in range(len(bucket_resos))] + img_ar_errors = [] + for i, image_path in enumerate(tqdm(image_paths, smoothing=0.0)): + image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0] + if image_key not in metadata: + metadata[image_key] = {} + + image = Image.open(image_path) + if image.mode != 'RGB': + image = image.convert("RGB") + + aspect_ratio = image.width / image.height + ar_errors = bucket_aspect_ratios - aspect_ratio + bucket_id = np.abs(ar_errors).argmin() + reso = bucket_resos[bucket_id] + ar_error = ar_errors[bucket_id] + img_ar_errors.append(abs(ar_error)) + + # どのサイズにリサイズするか→トリミングする方向で + if ar_error <= 0: # 横が長い→縦を合わせる + scale = reso[1] / image.height + else: + scale = reso[0] / image.width + + resized_size = (int(image.width * scale + .5), int(image.height * scale + .5)) + + # print(image.width, image.height, bucket_id, bucket_resos[bucket_id], ar_errors[bucket_id], resized_size, + # bucket_resos[bucket_id][0] - resized_size[0], bucket_resos[bucket_id][1] - resized_size[1]) + + assert resized_size[0] == reso[0] or resized_size[1] == reso[ + 1], f"internal error, resized size not match: {reso}, {resized_size}, {image.width}, {image.height}" + assert resized_size[0] >= reso[0] and resized_size[1] >= reso[ + 1], f"internal error, resized size too small: {reso}, {resized_size}, {image.width}, {image.height}" + + # 画像をリサイズしてトリミングする + # PILにinter_areaがないのでcv2で…… + image 
= np.array(image) + image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) + if resized_size[0] > reso[0]: + trim_size = resized_size[0] - reso[0] + image = image[:, trim_size//2:trim_size//2 + reso[0]] + elif resized_size[1] > reso[1]: + trim_size = resized_size[1] - reso[1] + image = image[trim_size//2:trim_size//2 + reso[1]] + assert image.shape[0] == reso[1] and image.shape[1] == reso[0], f"internal error, illegal trimmed size: {image.shape}, {reso}" + + # # debug + # cv2.imwrite(f"r:\\test\\img_{i:05d}.jpg", image[:, :, ::-1]) + + # バッチへ追加 + buckets_imgs[bucket_id].append((image_key, reso, image)) + bucket_counts[bucket_id] += 1 + metadata[image_key]['train_resolution'] = reso + + # バッチを推論するか判定して推論する + is_last = i == len(image_paths) - 1 + for j in range(len(buckets_imgs)): + bucket = buckets_imgs[j] + if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size: + latents = get_latents(vae, [img for _, _, img in bucket], weight_dtype) + + for (image_key, reso, _), latent in zip(bucket, latents): + np.savez(os.path.join(args.train_data_dir, os.path.splitext(os.path.basename(image_key))[0]), latent) + + # flip + if args.flip_aug: + latents = get_latents(vae, [img[:, ::-1].copy() for _, _, img in bucket], weight_dtype) # copyがないとTensor変換できない + + for (image_key, reso, _), latent in zip(bucket, latents): + np.savez(os.path.join(args.train_data_dir, os.path.splitext(os.path.basename(image_key))[0] + '_flip'), latent) + + bucket.clear() + + for i, (reso, count) in enumerate(zip(bucket_resos, bucket_counts)): + print(f"bucket {i} {reso}: {count}") + img_ar_errors = np.array(img_ar_errors) + print(f"mean ar error: {np.mean(img_ar_errors)}") + + # metadataを書き出して終わり + print(f"writing metadata: {args.out_json}") + with open(args.out_json, "wt", encoding='utf-8') as f: + json.dump(metadata, f, indent=2) + print("done!") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + parser.add_argument("model_name_or_path", type=str, help="model name or path to encode latents / latentを取得するためのモデル") + parser.add_argument("--v2", action='store_true', + help='load Stable Diffusion v2.0 model / Stable Diffusion 2.0のモデルを読み込む') + parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ") + parser.add_argument("--max_resolution", type=str, default="512,512", + help="max resolution in fine tuning (width,height) / fine tuning時の最大画像サイズ 「幅,高さ」(使用メモリ量に関係します)") + parser.add_argument("--min_bucket_reso", type=int, default=256, help="minimum resolution for buckets / bucketの最小解像度") + parser.add_argument("--max_bucket_reso", type=int, default=1024, help="maximum resolution for buckets / bucketの最小解像度") + parser.add_argument("--mixed_precision", type=str, default="no", + choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度") + parser.add_argument("--full_path", action="store_true", + help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") + parser.add_argument("--flip_aug", action="store_true", + help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する") + + args = parser.parse_args() + main(args) diff --git a/finetune/tag_images_by_wd14_tagger.py 
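The bucket choice above picks the bucket whose aspect ratio is closest to the image, scales the image so the "tight" side matches, and centre-crops the longer side. An illustration of that arithmetic, with an assumed bucket list (the real one comes from model_util.make_bucket_resolutions):

```python
import numpy as np

bucket_resos = [(512, 512), (576, 448), (448, 576)]       # assumed buckets (width, height)
ratios = np.array([w / h for w, h in bucket_resos])

w, h = 640, 480                                            # example image
bucket_id = int(np.abs(ratios - w / h).argmin())
bw, bh = bucket_resos[bucket_id]                            # -> (576, 448)

scale = bh / h if w / h >= bw / bh else bw / w              # image "wider" than bucket: fit the height
resized = (int(w * scale + 0.5), int(h * scale + 0.5))      # -> (597, 448); width is then cropped to 576
print(bucket_id, (bw, bh), resized)
```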
b/finetune/tag_images_by_wd14_tagger.py new file mode 100644 index 0000000..c576789 --- /dev/null +++ b/finetune/tag_images_by_wd14_tagger.py @@ -0,0 +1,143 @@ +# このスクリプトのライセンスは、Apache License 2.0とします +# (c) 2022 Kohya S. @kohya_ss + +import argparse +import csv +import glob +import os + +from PIL import Image +import cv2 +from tqdm import tqdm +import numpy as np +from tensorflow.keras.models import load_model +from huggingface_hub import hf_hub_download + +# from wd14 tagger +IMAGE_SIZE = 448 + +WD14_TAGGER_REPO = 'SmilingWolf/wd-v1-4-vit-tagger' +FILES = ["keras_metadata.pb", "saved_model.pb", "selected_tags.csv"] +SUB_DIR = "variables" +SUB_DIR_FILES = ["variables.data-00000-of-00001", "variables.index"] +CSV_FILE = FILES[-1] + + +def main(args): + # hf_hub_downloadをそのまま使うとsymlink関係で問題があるらしいので、キャッシュディレクトリとforce_filenameを指定してなんとかする + # depreacatedの警告が出るけどなくなったらその時 + # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22 + if not os.path.exists(args.model_dir) or args.force_download: + print("downloading wd14 tagger model from hf_hub") + for file in FILES: + hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file) + for file in SUB_DIR_FILES: + hf_hub_download(args.repo_id, file, subfolder=SUB_DIR, cache_dir=os.path.join( + args.model_dir, SUB_DIR), force_download=True, force_filename=file) + + # 画像を読み込む + image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \ + glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) + print(f"found {len(image_paths)} images.") + + print("loading model and labels") + model = load_model(args.model_dir) + + # label_names = pd.read_csv("2022_0000_0899_6549/selected_tags.csv") + # 依存ライブラリを増やしたくないので自力で読むよ + with open(os.path.join(args.model_dir, CSV_FILE), "r", encoding="utf-8") as f: + reader = csv.reader(f) + l = [row for row in reader] + header = l[0] # tag_id,name,category,count + rows = l[1:] + assert header[0] == 'tag_id' and header[1] == 'name' and header[2] == 'category', f"unexpected csv format: {header}" + + tags = [row[1] for row in rows[1:] if row[2] == '0'] # categoryが0、つまり通常のタグのみ + + # 推論する + def run_batch(path_imgs): + imgs = np.array([im for _, im in path_imgs]) + + probs = model(imgs, training=False) + probs = probs.numpy() + + for (image_path, _), prob in zip(path_imgs, probs): + # 最初の4つはratingなので無視する + # # First 4 labels are actually ratings: pick one with argmax + # ratings_names = label_names[:4] + # rating_index = ratings_names["probs"].argmax() + # found_rating = ratings_names[rating_index: rating_index + 1][["name", "probs"]] + + # それ以降はタグなのでconfidenceがthresholdより高いものを追加する + # Everything else is tags: pick any where prediction confidence > threshold + tag_text = "" + for i, p in enumerate(prob[4:]): # numpyとか使うのが良いけど、まあそれほど数も多くないのでループで + if p >= args.thresh: + tag_text += ", " + tags[i] + + if len(tag_text) > 0: + tag_text = tag_text[2:] # 最初の ", " を消す + + with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f: + f.write(tag_text + '\n') + if args.debug: + print(image_path, tag_text) + + b_imgs = [] + for image_path in tqdm(image_paths, smoothing=0.0): + img = Image.open(image_path) # cv2は日本語ファイル名で死ぬのとモード変換したいのでpillowで開く + if img.mode != 'RGB': + img = img.convert("RGB") + img = np.array(img) + img = img[:, :, ::-1] # RGB->BGR + + # pad to square + size = max(img.shape[0:2]) + pad_x = size - img.shape[1] + pad_y = size - img.shape[0] + pad_l = pad_x // 2 + 
pad_t = pad_y // 2 + img = np.pad(img, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode='constant', constant_values=255) + + interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4 + img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp) + # cv2.imshow("img", img) + # cv2.waitKey() + # cv2.destroyAllWindows() + + img = img.astype(np.float32) + b_imgs.append((image_path, img)) + + if len(b_imgs) >= args.batch_size: + run_batch(b_imgs) + b_imgs.clear() + + if len(b_imgs) > 0: + run_batch(b_imgs) + + print("done!") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("--repo_id", type=str, default=WD14_TAGGER_REPO, + help="repo id for wd14 tagger on Hugging Face / Hugging Faceのwd14 taggerのリポジトリID") + parser.add_argument("--model_dir", type=str, default="wd14_tagger_model", + help="directory to store wd14 tagger model / wd14 taggerのモデルを格納するディレクトリ") + parser.add_argument("--force_download", action='store_true', + help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします") + parser.add_argument("--thresh", type=float, default=0.35, help="threshold of confidence to add a tag / タグを追加するか判定する閾値") + parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ") + parser.add_argument("--caption_extention", type=str, default=None, + help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)") + parser.add_argument("--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子") + parser.add_argument("--debug", action="store_true", help="debug mode") + + args = parser.parse_args() + + # スペルミスしていたオプションを復元する + if args.caption_extention is not None: + args.caption_extension = args.caption_extention + + main(args) diff --git a/finetune_gui.py b/finetune_gui.py new file mode 100644 index 0000000..bb97739 --- /dev/null +++ b/finetune_gui.py @@ -0,0 +1,795 @@ +import gradio as gr +import json +import math +import os +import subprocess +import pathlib +import shutil + +# from easygui import fileopenbox, filesavebox, diropenbox, msgbox +from library.basic_caption_gui import gradio_basic_caption_gui_tab +from library.convert_model_gui import gradio_convert_model_tab +from library.blip_caption_gui import gradio_blip_caption_gui_tab +from library.wd14_caption_gui import gradio_wd14_caption_gui_tab +from library.common_gui import ( + get_folder_path, + get_file_path, + get_saveasfile_path, +) + +folder_symbol = '\U0001f4c2' # 📂 +refresh_symbol = '\U0001f504' # 🔄 +save_style_symbol = '\U0001f4be' # 💾 +document_symbol = '\U0001F4C4' # 📄 + + +def save_configuration( + save_as, + file_path, + pretrained_model_name_or_path, + v2, + v_parameterization, + train_dir, + image_folder, + output_dir, + logging_dir, + max_resolution, + learning_rate, + lr_scheduler, + lr_warmup, + dataset_repeats, + train_batch_size, + epoch, + save_every_n_epochs, + mixed_precision, + save_precision, + seed, + num_cpu_threads_per_process, + train_text_encoder, + create_buckets, + create_caption, + train, + save_model_as, + caption_extension, +): + original_file_path = file_path + + save_as_bool = True if save_as.get('label') == 'True' else False + + if save_as_bool: + print('Save as...') + file_path = get_saveasfile_path(file_path) + else: + print('Save...') + if file_path == None or file_path == '': + file_path = 
get_saveasfile_path(file_path) + + # print(file_path) + + if file_path == None: + return original_file_path + + # Return the values of the variables as a dictionary + variables = { + 'pretrained_model_name_or_path': pretrained_model_name_or_path, + 'v2': v2, + 'v_parameterization': v_parameterization, + 'train_dir': train_dir, + 'image_folder': image_folder, + 'output_dir': output_dir, + 'logging_dir': logging_dir, + 'max_resolution': max_resolution, + 'learning_rate': learning_rate, + 'lr_scheduler': lr_scheduler, + 'lr_warmup': lr_warmup, + 'dataset_repeats': dataset_repeats, + 'train_batch_size': train_batch_size, + 'epoch': epoch, + 'save_every_n_epochs': save_every_n_epochs, + 'mixed_precision': mixed_precision, + 'save_precision': save_precision, + 'seed': seed, + 'num_cpu_threads_per_process': num_cpu_threads_per_process, + 'train_text_encoder': train_text_encoder, + 'create_buckets': create_buckets, + 'create_caption': create_caption, + 'train': train, + 'save_model_as': save_model_as, + 'caption_extension': caption_extension, + } + + # Save the data to the selected file + # with open(file_path, 'w') as file: + # json.dump(variables, file) + # msgbox('File was saved...') + + return file_path + + +def open_config_file( + file_path, + pretrained_model_name_or_path, + v2, + v_parameterization, + train_dir, + image_folder, + output_dir, + logging_dir, + max_resolution, + learning_rate, + lr_scheduler, + lr_warmup, + dataset_repeats, + train_batch_size, + epoch, + save_every_n_epochs, + mixed_precision, + save_precision, + seed, + num_cpu_threads_per_process, + train_text_encoder, + create_buckets, + create_caption, + train, + save_model_as, + caption_extension, +): + original_file_path = file_path + file_path = get_file_path(file_path) + + if file_path != '' and file_path != None: + print(file_path) + # load variables from JSON file + with open(file_path, 'r') as f: + my_data = json.load(f) + else: + file_path = original_file_path # In case a file_path was provided and the user decide to cancel the open action + my_data = {} + + # Return the values of the variables as a dictionary + return ( + file_path, + my_data.get( + 'pretrained_model_name_or_path', pretrained_model_name_or_path + ), + my_data.get('v2', v2), + my_data.get('v_parameterization', v_parameterization), + my_data.get('train_dir', train_dir), + my_data.get('image_folder', image_folder), + my_data.get('output_dir', output_dir), + my_data.get('logging_dir', logging_dir), + my_data.get('max_resolution', max_resolution), + my_data.get('learning_rate', learning_rate), + my_data.get('lr_scheduler', lr_scheduler), + my_data.get('lr_warmup', lr_warmup), + my_data.get('dataset_repeats', dataset_repeats), + my_data.get('train_batch_size', train_batch_size), + my_data.get('epoch', epoch), + my_data.get('save_every_n_epochs', save_every_n_epochs), + my_data.get('mixed_precision', mixed_precision), + my_data.get('save_precision', save_precision), + my_data.get('seed', seed), + my_data.get( + 'num_cpu_threads_per_process', num_cpu_threads_per_process + ), + my_data.get('train_text_encoder', train_text_encoder), + my_data.get('create_buckets', create_buckets), + my_data.get('create_caption', create_caption), + my_data.get('train', train), + my_data.get('save_model_as', save_model_as), + my_data.get('caption_extension', caption_extension), + ) + + +def train_model( + generate_caption_database, + generate_image_buckets, + train, + pretrained_model_name_or_path, + v2, + v_parameterization, + train_dir, + image_folder, + output_dir, + 
logging_dir, + max_resolution, + learning_rate, + lr_scheduler, + lr_warmup, + dataset_repeats, + train_batch_size, + epoch, + save_every_n_epochs, + mixed_precision, + save_precision, + seed, + num_cpu_threads_per_process, + train_text_encoder, + save_model_as, + caption_extension, +): + def save_inference_file(output_dir, v2, v_parameterization): + # Copy inference model for v2 if required + if v2 and v_parameterization: + print(f'Saving v2-inference-v.yaml as {output_dir}/last.yaml') + shutil.copy( + f'./v2_inference/v2-inference-v.yaml', + f'{output_dir}/last.yaml', + ) + elif v2: + print(f'Saving v2-inference.yaml as {output_dir}/last.yaml') + shutil.copy( + f'./v2_inference/v2-inference.yaml', + f'{output_dir}/last.yaml', + ) + + # create caption json file + if generate_caption_database: + if not os.path.exists(train_dir): + os.mkdir(train_dir) + + run_cmd = ( + f'./venv/Scripts/python.exe finetune/merge_captions_to_metadata.py' + ) + if caption_extension == '': + run_cmd += f' --caption_extension=".txt"' + else: + run_cmd += f' --caption_extension={caption_extension}' + run_cmd += f' {image_folder}' + run_cmd += f' {train_dir}/meta_cap.json' + run_cmd += f' --full_path' + + print(run_cmd) + + # Run the command + subprocess.run(run_cmd) + + # create images buckets + if generate_image_buckets: + command = [ + './venv/Scripts/python.exe', + 'finetune/prepare_buckets_latents.py', + image_folder, + '{}/meta_cap.json'.format(train_dir), + '{}/meta_lat.json'.format(train_dir), + pretrained_model_name_or_path, + '--batch_size', + '4', + '--max_resolution', + max_resolution, + '--mixed_precision', + mixed_precision, + '--full_path', + ] + + print(command) + + # Run the command + subprocess.run(command) + + if train: + image_num = len( + [f for f in os.listdir(image_folder) if f.endswith('.npz')] + ) + print(f'image_num = {image_num}') + + repeats = int(image_num) * int(dataset_repeats) + print(f'repeats = {str(repeats)}') + + # calculate max_train_steps + max_train_steps = int( + math.ceil(float(repeats) / int(train_batch_size) * int(epoch)) + ) + print(f'max_train_steps = {max_train_steps}') + + lr_warmup_steps = round( + float(int(lr_warmup) * int(max_train_steps) / 100) + ) + print(f'lr_warmup_steps = {lr_warmup_steps}') + + run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "./fine_tune.py"' + if v2: + run_cmd += ' --v2' + if v_parameterization: + run_cmd += ' --v_parameterization' + if train_text_encoder: + run_cmd += ' --train_text_encoder' + run_cmd += ( + f' --pretrained_model_name_or_path={pretrained_model_name_or_path}' + ) + run_cmd += f' --in_json={train_dir}/meta_lat.json' + run_cmd += f' --train_data_dir={image_folder}' + run_cmd += f' --output_dir={output_dir}' + if not logging_dir == '': + run_cmd += f' --logging_dir={logging_dir}' + run_cmd += f' --train_batch_size={train_batch_size}' + run_cmd += f' --dataset_repeats={dataset_repeats}' + run_cmd += f' --learning_rate={learning_rate}' + run_cmd += f' --lr_scheduler={lr_scheduler}' + run_cmd += f' --lr_warmup_steps={lr_warmup_steps}' + run_cmd += f' --max_train_steps={max_train_steps}' + run_cmd += f' --use_8bit_adam' + run_cmd += f' --xformers' + run_cmd += f' --mixed_precision={mixed_precision}' + run_cmd += f' --save_every_n_epochs={save_every_n_epochs}' + run_cmd += f' --seed={seed}' + run_cmd += f' --save_precision={save_precision}' + if not save_model_as == 'same as source model': + run_cmd += f' --save_model_as={save_model_as}' + + print(run_cmd) + # Run the command + 
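+        # Note (added comment): run_cmd is passed to subprocess.run as a single command string,
+        # which works on Windows (this GUI already assumes a Windows layout, e.g. the ./venv/Scripts/ paths above).
+        # On Linux/macOS the string would need to be split into an argument list first,
+        # e.g. with shlex.split(run_cmd).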
subprocess.run(run_cmd) + + # check if output_dir/last is a folder... therefore it is a diffuser model + last_dir = pathlib.Path(f'{output_dir}/last') + + if not last_dir.is_dir(): + # Copy inference model for v2 if required + save_inference_file(output_dir, v2, v_parameterization) + + +def set_pretrained_model_name_or_path_input(value, v2, v_parameterization): + # define a list of substrings to search for + substrings_v2 = [ + 'stabilityai/stable-diffusion-2-1-base', + 'stabilityai/stable-diffusion-2-base', + ] + + # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v2 list + if str(value) in substrings_v2: + print('SD v2 model detected. Setting --v2 parameter') + v2 = True + v_parameterization = False + + return value, v2, v_parameterization + + # define a list of substrings to search for v-objective + substrings_v_parameterization = [ + 'stabilityai/stable-diffusion-2-1', + 'stabilityai/stable-diffusion-2', + ] + + # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v_parameterization list + if str(value) in substrings_v_parameterization: + print( + 'SD v2 v_parameterization detected. Setting --v2 parameter and --v_parameterization' + ) + v2 = True + v_parameterization = True + + return value, v2, v_parameterization + + # define a list of substrings to v1.x + substrings_v1_model = [ + 'CompVis/stable-diffusion-v1-4', + 'runwayml/stable-diffusion-v1-5', + ] + + if str(value) in substrings_v1_model: + v2 = False + v_parameterization = False + + return value, v2, v_parameterization + + if value == 'custom': + value = '' + v2 = False + v_parameterization = False + + return value, v2, v_parameterization + + +def remove_doublequote(file_path): + if file_path != None: + file_path = file_path.replace('"', '') + + return file_path + + +css = '' + +if os.path.exists('./style.css'): + with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: + print('Load CSS...') + css += file.read() + '\n' + +interface = gr.Blocks(css=css) + +with interface: + dummy_true = gr.Label(value=True, visible=False) + dummy_false = gr.Label(value=False, visible=False) + with gr.Tab('Finetuning'): + gr.Markdown('Enter kohya finetuner parameter using this interface.') + with gr.Accordion('Configuration File Load/Save', open=False): + with gr.Row(): + button_open_config = gr.Button( + f'Open {folder_symbol}', elem_id='open_folder' + ) + button_save_config = gr.Button( + f'Save {save_style_symbol}', elem_id='open_folder' + ) + button_save_as_config = gr.Button( + f'Save as... {save_style_symbol}', elem_id='open_folder' + ) + config_file_name = gr.Textbox( + label='', placeholder='type file path or use buttons...' 
+ ) + config_file_name.change( + remove_doublequote, + inputs=[config_file_name], + outputs=[config_file_name], + ) + with gr.Tab('Source model'): + # Define the input elements + with gr.Row(): + pretrained_model_name_or_path_input = gr.Textbox( + label='Pretrained model name or path', + placeholder='enter the path to custom model or name of pretrained model', + ) + pretrained_model_name_or_path_file = gr.Button( + document_symbol, elem_id='open_folder_small' + ) + pretrained_model_name_or_path_file.click( + get_file_path, + inputs=pretrained_model_name_or_path_input, + outputs=pretrained_model_name_or_path_input, + ) + pretrained_model_name_or_path_folder = gr.Button( + folder_symbol, elem_id='open_folder_small' + ) + pretrained_model_name_or_path_folder.click( + get_folder_path, + inputs=pretrained_model_name_or_path_input, + outputs=pretrained_model_name_or_path_input, + ) + model_list = gr.Dropdown( + label='(Optional) Model Quick Pick', + choices=[ + 'custom', + 'stabilityai/stable-diffusion-2-1-base', + 'stabilityai/stable-diffusion-2-base', + 'stabilityai/stable-diffusion-2-1', + 'stabilityai/stable-diffusion-2', + 'runwayml/stable-diffusion-v1-5', + 'CompVis/stable-diffusion-v1-4', + ], + ) + save_model_as_dropdown = gr.Dropdown( + label='Save trained model as', + choices=[ + 'same as source model', + 'ckpt', + 'diffusers', + 'diffusers_safetensors', + 'safetensors', + ], + value='same as source model', + ) + + with gr.Row(): + v2_input = gr.Checkbox(label='v2', value=True) + v_parameterization_input = gr.Checkbox( + label='v_parameterization', value=False + ) + model_list.change( + set_pretrained_model_name_or_path_input, + inputs=[model_list, v2_input, v_parameterization_input], + outputs=[ + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + ], + ) + with gr.Tab('Directories'): + with gr.Row(): + train_dir_input = gr.Textbox( + label='Training config folder', + placeholder='folder where the training configuration files will be saved', + ) + train_dir_folder = gr.Button( + folder_symbol, elem_id='open_folder_small' + ) + train_dir_folder.click( + get_folder_path, outputs=train_dir_input + ) + + image_folder_input = gr.Textbox( + label='Training Image folder', + placeholder='folder where the training images are located', + ) + image_folder_input_folder = gr.Button( + folder_symbol, elem_id='open_folder_small' + ) + image_folder_input_folder.click( + get_folder_path, outputs=image_folder_input + ) + with gr.Row(): + output_dir_input = gr.Textbox( + label='Output folder', + placeholder='folder where the model will be saved', + ) + output_dir_input_folder = gr.Button( + folder_symbol, elem_id='open_folder_small' + ) + output_dir_input_folder.click( + get_folder_path, outputs=output_dir_input + ) + + logging_dir_input = gr.Textbox( + label='Logging folder', + placeholder='Optional: enable logging and output TensorBoard log to this folder', + ) + logging_dir_input_folder = gr.Button( + folder_symbol, elem_id='open_folder_small' + ) + logging_dir_input_folder.click( + get_folder_path, outputs=logging_dir_input + ) + train_dir_input.change( + remove_doublequote, + inputs=[train_dir_input], + outputs=[train_dir_input], + ) + image_folder_input.change( + remove_doublequote, + inputs=[image_folder_input], + outputs=[image_folder_input], + ) + output_dir_input.change( + remove_doublequote, + inputs=[output_dir_input], + outputs=[output_dir_input], + ) + with gr.Tab('Training parameters'): + with gr.Row(): + learning_rate_input = gr.Textbox( + label='Learning 
rate', value=1e-6 + ) + lr_scheduler_input = gr.Dropdown( + label='LR Scheduler', + choices=[ + 'constant', + 'constant_with_warmup', + 'cosine', + 'cosine_with_restarts', + 'linear', + 'polynomial', + ], + value='constant', + ) + lr_warmup_input = gr.Textbox(label='LR warmup', value=0) + with gr.Row(): + dataset_repeats_input = gr.Textbox( + label='Dataset repeats', value=40 + ) + train_batch_size_input = gr.Slider( + minimum=1, + maximum=32, + label='Train batch size', + value=1, + step=1, + ) + epoch_input = gr.Textbox(label='Epoch', value=1) + save_every_n_epochs_input = gr.Textbox( + label='Save every N epochs', value=1 + ) + with gr.Row(): + mixed_precision_input = gr.Dropdown( + label='Mixed precision', + choices=[ + 'no', + 'fp16', + 'bf16', + ], + value='fp16', + ) + save_precision_input = gr.Dropdown( + label='Save precision', + choices=[ + 'float', + 'fp16', + 'bf16', + ], + value='fp16', + ) + num_cpu_threads_per_process_input = gr.Slider( + minimum=1, + maximum=os.cpu_count(), + step=1, + label='Number of CPU threads per process', + value=os.cpu_count(), + ) + with gr.Row(): + seed_input = gr.Textbox(label='Seed', value=1234) + max_resolution_input = gr.Textbox( + label='Max resolution', value='512,512' + ) + with gr.Row(): + caption_extention_input = gr.Textbox( + label='Caption Extension', + placeholder='(Optional) Extension for caption files. default: .txt', + ) + train_text_encoder_input = gr.Checkbox( + label='Train text encoder', value=True + ) + with gr.Box(): + with gr.Row(): + create_caption = gr.Checkbox( + label='Generate caption database', value=True + ) + create_buckets = gr.Checkbox( + label='Generate image buckets', value=True + ) + train = gr.Checkbox(label='Train model', value=True) + + button_run = gr.Button('Run') + + button_run.click( + train_model, + inputs=[ + create_caption, + create_buckets, + train, + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + train_dir_input, + image_folder_input, + output_dir_input, + logging_dir_input, + max_resolution_input, + learning_rate_input, + lr_scheduler_input, + lr_warmup_input, + dataset_repeats_input, + train_batch_size_input, + epoch_input, + save_every_n_epochs_input, + mixed_precision_input, + save_precision_input, + seed_input, + num_cpu_threads_per_process_input, + train_text_encoder_input, + save_model_as_dropdown, + caption_extention_input, + ], + ) + + button_open_config.click( + open_config_file, + inputs=[ + config_file_name, + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + train_dir_input, + image_folder_input, + output_dir_input, + logging_dir_input, + max_resolution_input, + learning_rate_input, + lr_scheduler_input, + lr_warmup_input, + dataset_repeats_input, + train_batch_size_input, + epoch_input, + save_every_n_epochs_input, + mixed_precision_input, + save_precision_input, + seed_input, + num_cpu_threads_per_process_input, + train_text_encoder_input, + create_buckets, + create_caption, + train, + save_model_as_dropdown, + caption_extention_input, + ], + outputs=[ + config_file_name, + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + train_dir_input, + image_folder_input, + output_dir_input, + logging_dir_input, + max_resolution_input, + learning_rate_input, + lr_scheduler_input, + lr_warmup_input, + dataset_repeats_input, + train_batch_size_input, + epoch_input, + save_every_n_epochs_input, + mixed_precision_input, + save_precision_input, + seed_input, + num_cpu_threads_per_process_input, + 
train_text_encoder_input, + create_buckets, + create_caption, + train, + save_model_as_dropdown, + caption_extention_input, + ], + ) + + button_save_config.click( + save_configuration, + inputs=[ + dummy_false, + config_file_name, + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + train_dir_input, + image_folder_input, + output_dir_input, + logging_dir_input, + max_resolution_input, + learning_rate_input, + lr_scheduler_input, + lr_warmup_input, + dataset_repeats_input, + train_batch_size_input, + epoch_input, + save_every_n_epochs_input, + mixed_precision_input, + save_precision_input, + seed_input, + num_cpu_threads_per_process_input, + train_text_encoder_input, + create_buckets, + create_caption, + train, + save_model_as_dropdown, + caption_extention_input, + ], + outputs=[config_file_name], + ) + + button_save_as_config.click( + save_configuration, + inputs=[ + dummy_true, + config_file_name, + pretrained_model_name_or_path_input, + v2_input, + v_parameterization_input, + train_dir_input, + image_folder_input, + output_dir_input, + logging_dir_input, + max_resolution_input, + learning_rate_input, + lr_scheduler_input, + lr_warmup_input, + dataset_repeats_input, + train_batch_size_input, + epoch_input, + save_every_n_epochs_input, + mixed_precision_input, + save_precision_input, + seed_input, + num_cpu_threads_per_process_input, + train_text_encoder_input, + create_buckets, + create_caption, + train, + save_model_as_dropdown, + caption_extention_input, + ], + outputs=[config_file_name], + ) + + with gr.Tab('Utilities'): + gradio_basic_caption_gui_tab() + gradio_blip_caption_gui_tab() + gradio_wd14_caption_gui_tab() + gradio_convert_model_tab() + + +# Show the interface +interface.launch() diff --git a/library/blip_caption_gui.py b/library/blip_caption_gui.py index 6583b9f..aadb394 100644 --- a/library/blip_caption_gui.py +++ b/library/blip_caption_gui.py @@ -28,7 +28,7 @@ def caption_images( return print(f'Captioning files in {train_data_dir}...') - run_cmd = f'.\\venv\\Scripts\\python.exe "./BLIP_caption/make_captions.py"' + run_cmd = f'.\\venv\\Scripts\\python.exe "finetune/make_captions.py"' run_cmd += f' --batch_size="{int(batch_size)}"' run_cmd += f' --num_beams="{int(num_beams)}"' run_cmd += f' --top_p="{top_p}"' @@ -39,7 +39,7 @@ def caption_images( if caption_file_ext != '': run_cmd += f' --caption_extension="{caption_file_ext}"' run_cmd += f' "{train_data_dir}"' - run_cmd += f' "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"' + run_cmd += f' --caption_weights="https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"' print(run_cmd) diff --git a/library/common_gui.py b/library/common_gui.py index bf6f291..ed9a581 100644 --- a/library/common_gui.py +++ b/library/common_gui.py @@ -72,6 +72,10 @@ def get_saveasfile_path(file_path='', defaultextension='.json'): def add_pre_postfix( folder='', prefix='', postfix='', caption_file_ext='.caption' ): + # set caption extention to default in case it was not provided + if caption_file_ext == '': + caption_file_ext = '.caption' + files = [f for f in os.listdir(folder) if f.endswith(caption_file_ext)] if not prefix == '': prefix = f'{prefix} ' diff --git a/library/wd14_caption_gui.py b/library/wd14_caption_gui.py index b575ec2..13645ce 100644 --- a/library/wd14_caption_gui.py +++ b/library/wd14_caption_gui.py @@ -16,7 +16,7 @@ def caption_images(train_data_dir, caption_extension, batch_size, thresh): return 
print(f'Captioning files in {train_data_dir}...') - run_cmd = f'accelerate launch "./script/tag_images_by_wd14_tagger.py"' + run_cmd = f'accelerate launch "./finetune/tag_images_by_wd14_tagger.py"' run_cmd += f' --batch_size="{int(batch_size)}"' run_cmd += f' --thresh="{thresh}"' if caption_extension != '': diff --git a/mytraining.ps b/mytraining.ps deleted file mode 100644 index 296d96c..0000000 --- a/mytraining.ps +++ /dev/null @@ -1,609 +0,0 @@ -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned.ckpt" ` - --train_data_dir="D:\dreambooth\train_bernard\train_man" ` - --reg_data_dir="D:\dreambooth\train_bernard\reg_man" ` - --output_dir="D:\dreambooth\train_bernard" ` - --prior_loss_weight=1.0 ` - --resolution="512,512" ` - --train_batch_size=1 ` - --learning_rate=1e-6 ` - --max_train_steps=3000 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --gradient_checkpointing ` - --save_every_n_epochs=1 - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6.py ` - --pretrained_model_name_or_path="D:\models\bernard\asd man-3000-remgb-sd15.ckpt" ` - --train_data_dir="D:\dreambooth\train_bernard\train_man" ` - --reg_data_dir="D:\dreambooth\train_bernard\reg_man" ` - --output_dir="D:\dreambooth\train_bernard" ` - --prior_loss_weight=1.0 ` - --resolution="512,512" ` - --train_batch_size=1 ` - --learning_rate=1e-6 ` - --max_train_steps=1500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --gradient_checkpointing ` - --save_every_n_epochs=1 - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\train_bernard\train_man" ` - --reg_data_dir="D:\dreambooth\train_bernard\reg_man" ` - --output_dir="D:\dreambooth\train_bernard" ` - --prior_loss_weight=1.0 ` - --resolution="512,512" ` - --train_batch_size=1 ` - --learning_rate=1e-6 ` - --max_train_steps=4500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --gradient_checkpointing ` - --no_token_padding ` - --save_every_n_epochs=1 - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\source\alex\train" ` - --output_dir="D:\dreambooth\train_alex" ` - --prior_loss_weight=1.0 ` - --resolution="448,640" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=4500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --shuffle_caption - -# -fine_tuning - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\source\alex\train\50_portrait-pp" ` - --output_dir="D:\dreambooth\train_alex" ` - --resolution="448,640" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=4500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --shuffle_caption - -Resume: - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\source\alet_et_bernard\landscape-pp" ` - 
--output_dir="D:\dreambooth\train_alex_and_bernard" ` - --resolution="640,448" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=550 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -# Mollie Monger - -e1: - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\landscape-pp" ` - --output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="640,448" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=625 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\mollie_monger-kohya-l-200-sd15.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\portrait-pp" ` - --output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="448,640" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=1275 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\mollie_monger-kohya-l+p-200-sd15.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\square-pp" ` - --output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -e2: - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\mollie_monger\mollie_monger-kohya-l+p+s-r200-e1-sd15.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\landscape-pp" ` - --output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="640,448" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=625 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\mollie_monger\last.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\portrait-pp" ` - --output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="448,640" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=1275 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\mollie_monger\last.ckpt" ` - --train_data_dir="D:\dreambooth\train_mollie_monger\square-pp" ` - 
--output_dir="D:\dreambooth\train_mollie_monger\output" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=500 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=200 ` - --seed=23 ` - --save_half - - - Midjourney images download: - - https://storage.googleapis.com/dream-machines-output/2932e6e4-ddef-410e-947b-2a6275e31f35/0_3.png - - # Midjourney - - accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\train_midjourney_v4\all data" ` - --output_dir="D:\dreambooth\train_midjourney_v4\model" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=528 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=12 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\midjourney_v4-khoya-r100-e1-sd15.ckpt" ` - --train_data_dir="D:\dreambooth\train_midjourney_v4\data2" ` - --output_dir="D:\dreambooth\train_midjourney_v4\model" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=850 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=100 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\models\midjourney_v4_finetune\epoch-000001.ckpt" ` - --train_data_dir="D:\dreambooth\train_midjourney_v4\newdata3" ` - --output_dir="D:\dreambooth\train_midjourney_v4\model" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=159 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=24 ` - --seed=23 ` - --save_half - -# train n - - # Midjourney - - accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v6-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\train_childrens_drawings\model\last2.ckpt" ` - --train_data_dir="D:\dreambooth\train_childrens_drawings\data2-pp" ` - --output_dir="D:\dreambooth\train_childrens_drawings\model" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=312 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --fine_tuning_repeat=48 ` - --seed=23 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\train_childrens_drawings\model\last2.ckpt" ` - --train_data_dir="D:\dreambooth\train_childrens_drawings\data2-pp" ` - --output_dir="D:\dreambooth\train_childrens_drawings\model" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=312 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=48 ` - --seed=23 ` - --save_half - -# twq - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - 
--pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\source\bernardv2-ft" ` - --output_dir="D:\dreambooth\train_bernard\model" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=720 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=48 ` - --save_half - -# the white queen - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\landscape-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=520 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\portrait-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p" ` - --resolution="512,704" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=260 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\square-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p+s" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=220 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --seed=23 ` - --save_half - -# the white queen slow progress init phase - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\landscape-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=260 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\portrait-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p" ` - --resolution="512,704" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=130 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 
train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\square-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p+s" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=90 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --seed=23 ` - --save_half - -# the white queen slow progress extra steps phase - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p+s\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\landscape-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=130 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\portrait-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p" ` - --resolution="512,704" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=65 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\square-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p+s" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=45 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --seed=23 ` - --save_half - -# the queen of heart init phase - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\models\v1-5-pruned-mse-vae.ckpt" ` - --train_data_dir="D:\dreambooth\train_qoh\landscape-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=260 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\portrait-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p" ` - --resolution="512,704" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=130 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - 
--save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\square-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p+s" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=90 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=80 ` - --seed=23 ` - --save_half - -# the white queen slow progress extra steps phase - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p+s\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\landscape-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l" ` - --resolution="704,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=130 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\portrait-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p" ` - --resolution="512,704" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=65 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --save_half - -accelerate launch --num_cpu_threads_per_process 6 train_db_fixed_v7-ber.py ` - --pretrained_model_name_or_path="D:\dreambooth\training_twq\the_white_queen\model+l+p\last.ckpt" ` - --train_data_dir="D:\dreambooth\training_twq\the_white_queen\square-ft" ` - --output_dir="D:\dreambooth\training_twq\the_white_queen\model+l+p+s" ` - --resolution="512,512" ` - --train_batch_size=8 ` - --learning_rate=1e-6 ` - --max_train_steps=45 ` - --use_8bit_adam ` - --xformers ` - --mixed_precision="fp16" ` - --cache_latents ` - --save_every_n_epochs=1 ` - --fine_tuning ` - --dataset_repeats=40 ` - --seed=23 ` - --save_half \ No newline at end of file diff --git a/sample/images/1_asd dog/info.txt b/sample/images/1_asd dog/info.txt deleted file mode 100644 index 065f369..0000000 --- a/sample/images/1_asd dog/info.txt +++ /dev/null @@ -1 +0,0 @@ -Put your asd dog images you want to train in this folder \ No newline at end of file diff --git a/sample/regularisation/256_dog/info.txt b/sample/regularisation/256_dog/info.txt deleted file mode 100644 index 785fbda..0000000 --- a/sample/regularisation/256_dog/info.txt +++ /dev/null @@ -1 +0,0 @@ -Put your dog class regularization images in here \ No newline at end of file diff --git a/setup.py b/setup.py index 7bf5483..96d88fb 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,3 @@ from setuptools import setup, find_packages -setup(name = "library", packages = find_packages()) \ No newline at end of file +setup(name = "library", version="1.0.0", packages = find_packages()) \ No newline at end of file diff --git a/cudann_1.8 install.py 
b/tools/cudann_1.8_install.py similarity index 87% rename from cudann_1.8 install.py rename to tools/cudann_1.8_install.py index 8bd6d76..75aef92 100644 --- a/cudann_1.8 install.py +++ b/tools/cudann_1.8_install.py @@ -11,7 +11,7 @@ if sys.version_info < (3, 8): else: import importlib.metadata as importlib_metadata -req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt") +req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../requirements.txt") def run(command, desc=None, errdesc=None, custom_env=None): if desc is not None: @@ -83,9 +83,9 @@ check_versions() # Check for "different" B&B Files and copy only if necessary if os.name == "nt": python = sys.executable - bnb_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bitsandbytes_windows") + bnb_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../bitsandbytes_windows") bnb_dest = os.path.join(sysconfig.get_paths()["purelib"], "bitsandbytes") - cudnn_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cudnn_windows") + cudnn_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../cudnn_windows") cudnn_dest = os.path.join(sysconfig.get_paths()["purelib"], "torch", "lib") print(f"Checking for CUDNN files in {cudnn_dest}") @@ -101,9 +101,4 @@ if os.name == "nt": shutil.copy2(src_file, cudnn_dest) print("Copied CUDNN 8.6 files to destination") - # diffusers_cmd = "git+https://github.com/huggingface/diffusers.git@8e74efa#egg=diffusers --force-reinstall" - # run(f'"{python}" -m pip install {diffusers_cmd}', "Installing particular diffusers commit", "Couldn't install diffusers") - # #install requirements file - # req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt") - # run(f'"{python}" -m pip install -r "{req_file}"', "Updating requirements", "Couldn't install requirements") \ No newline at end of file diff --git a/train_db_fixed.py b/train_db.py similarity index 100% rename from train_db_fixed.py rename to train_db.py diff --git a/upgrade.bat b/upgrade.bat new file mode 100644 index 0000000..1db1b95 --- /dev/null +++ b/upgrade.bat @@ -0,0 +1,2 @@ +git pull +.\venv\Scripts\python.exe -m pip install -U -r .\requirements.txt \ No newline at end of file