Some changes to the tensorboard code and hypernetwork support
This commit is contained in:
parent
a6d593a6b5
commit
8f59129847
@ -4,6 +4,7 @@ import html
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
import tensorboard
|
||||||
import tqdm
|
import tqdm
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
@ -18,7 +19,6 @@ import modules.textual_inversion.dataset
|
|||||||
from modules.textual_inversion import textual_inversion
|
from modules.textual_inversion import textual_inversion
|
||||||
from modules.textual_inversion.learn_schedule import LearnRateScheduler
|
from modules.textual_inversion.learn_schedule import LearnRateScheduler
|
||||||
|
|
||||||
|
|
||||||
class HypernetworkModule(torch.nn.Module):
|
class HypernetworkModule(torch.nn.Module):
|
||||||
multiplier = 1.0
|
multiplier = 1.0
|
||||||
|
|
||||||
@ -291,6 +291,9 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
|
|||||||
scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
|
scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
|
||||||
optimizer = torch.optim.AdamW(weights, lr=scheduler.learn_rate)
|
optimizer = torch.optim.AdamW(weights, lr=scheduler.learn_rate)
|
||||||
|
|
||||||
|
if shared.opts.training_enable_tensorboard:
|
||||||
|
tensorboard_writer = textual_inversion.tensorboard_setup(log_directory)
|
||||||
|
|
||||||
pbar = tqdm.tqdm(enumerate(ds), total=steps - ititial_step)
|
pbar = tqdm.tqdm(enumerate(ds), total=steps - ititial_step)
|
||||||
for i, entries in pbar:
|
for i, entries in pbar:
|
||||||
hypernetwork.step = i + ititial_step
|
hypernetwork.step = i + ititial_step
|
||||||
@ -315,6 +318,7 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
|
|||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
mean_loss = losses.mean()
|
mean_loss = losses.mean()
|
||||||
if torch.isnan(mean_loss):
|
if torch.isnan(mean_loss):
|
||||||
raise RuntimeError("Loss diverged.")
|
raise RuntimeError("Loss diverged.")
|
||||||
@ -324,6 +328,14 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
|
|||||||
last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name}-{hypernetwork.step}.pt')
|
last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name}-{hypernetwork.step}.pt')
|
||||||
hypernetwork.save(last_saved_file)
|
hypernetwork.save(last_saved_file)
|
||||||
|
|
||||||
|
if shared.opts.training_enable_tensorboard:
|
||||||
|
epoch_num = hypernetwork.step // len(ds)
|
||||||
|
epoch_step = hypernetwork.step - (epoch_num * len(ds)) + 1
|
||||||
|
|
||||||
|
textual_inversion.tensorboard_add(tensorboard_writer, loss=mean_loss,
|
||||||
|
global_step=hypernetwork.step, step=epoch_step,
|
||||||
|
learn_rate=scheduler.learn_rate, epoch_num=epoch_num)
|
||||||
|
|
||||||
textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, len(ds), {
|
textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, len(ds), {
|
||||||
"loss": f"{mean_loss:.7f}",
|
"loss": f"{mean_loss:.7f}",
|
||||||
"learn_rate": scheduler.learn_rate
|
"learn_rate": scheduler.learn_rate
|
||||||
@ -360,6 +372,10 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
|
|||||||
processed = processing.process_images(p)
|
processed = processing.process_images(p)
|
||||||
image = processed.images[0] if len(processed.images)>0 else None
|
image = processed.images[0] if len(processed.images)>0 else None
|
||||||
|
|
||||||
|
if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
|
||||||
|
textual_inversion.tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}",
|
||||||
|
image, hypernetwork.step)
|
||||||
|
|
||||||
if unload:
|
if unload:
|
||||||
shared.sd_model.cond_stage_model.to(devices.cpu)
|
shared.sd_model.cond_stage_model.to(devices.cpu)
|
||||||
shared.sd_model.first_stage_model.to(devices.cpu)
|
shared.sd_model.first_stage_model.to(devices.cpu)
|
||||||
|
@ -201,16 +201,27 @@ def write_loss(log_directory, filename, step, epoch_len, values):
|
|||||||
**values,
|
**values,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def tensorboard_setup(log_directory):
    """Create the tensorboard log folder and return a writer bound to it.

    The folder is ``<log_directory>/tensorboard``; it is created if missing
    (an already existing folder is not an error).

    Args:
        log_directory: Base directory under which the ``tensorboard``
            subdirectory is created.

    Returns:
        The ``SummaryWriter`` constructed with ``log_dir`` pointing at that
        subdirectory and ``flush_secs`` taken from
        ``shared.opts.training_tensorboard_flush_every``.
    """
    tensorboard_dir = os.path.join(log_directory, "tensorboard")
    os.makedirs(tensorboard_dir, exist_ok=True)

    flush_interval = shared.opts.training_tensorboard_flush_every
    return SummaryWriter(log_dir=tensorboard_dir, flush_secs=flush_interval)
|
||||||
|
|
||||||
|
def tensorboard_add(tensorboard_writer, loss, global_step, step, learn_rate, epoch_num):
    """Log the loss and learn rate of one training step as four scalars.

    Each value is written twice: once under its overall tag against
    ``global_step``, and once under a per-epoch tag (suffixed with
    ``epoch-<epoch_num>``) against ``step``.

    Args:
        tensorboard_writer: Writer passed through to ``tensorboard_add_scaler``.
        loss: Loss value to record.
        global_step: Step index used for the overall series.
        step: Step index used for the per-epoch series.
        learn_rate: Learn rate value to record.
        epoch_num: Epoch number embedded in the per-epoch tags.
    """
    # (tag, value, step index) in the order the points are emitted.
    scalars = (
        ("Loss/train", loss, global_step),
        (f"Loss/train/epoch-{epoch_num}", loss, step),
        ("Learn rate/train", learn_rate, global_step),
        (f"Learn rate/train/epoch-{epoch_num}", learn_rate, step),
    )
    for tag, value, at_step in scalars:
        tensorboard_add_scaler(tensorboard_writer, tag, value, at_step)
|
||||||
|
|
||||||
def tensorboard_add_scaler(tensorboard_writer, tag, value, step):
    """Record a single scalar data point on ``tensorboard_writer``.

    This helper does not consult ``shared.opts.training_enable_tensorboard``;
    the training loops check that option themselves before creating a writer
    and before logging.

    Args:
        tensorboard_writer: Object exposing ``add_scalar(tag=, scalar_value=,
            global_step=)``.
        tag: Name of the series the point belongs to.
        value: Scalar value to record.
        step: Step index the value is recorded against.
    """
    tensorboard_writer.add_scalar(tag=tag, scalar_value=value, global_step=step)
|
||||||
|
|
||||||
def tensorboard_add_image(tensorboard_writer, tag, pil_image, step):
    """Convert ``pil_image`` to a channel-first tensor and log it as an image.

    This helper does not consult the tensorboard options in ``shared.opts``;
    callers decide whether images should be logged before calling it.

    Args:
        tensorboard_writer: Object exposing ``add_image(tag, img_tensor,
            global_step=)``.
        tag: Name under which the image is recorded.
        pil_image: Image providing ``size``, ``getbands()`` and array
            conversion via ``np.array``. ``size`` is read as
            ``(width, height)`` -- presumably a PIL image; confirm at callers.
        step: Step index the image is recorded against.
    """
    # Convert a pil image to a torch tensor
    img_tensor = torch.as_tensor(np.array(pil_image, copy=True))

    width, height = pil_image.size
    channels = len(pil_image.getbands())

    # Force an explicit (height, width, channels) layout, then move the
    # channel axis to the front before handing the tensor to the writer.
    img_tensor = img_tensor.view(height, width, channels)
    img_tensor = img_tensor.permute((2, 0, 1))

    tensorboard_writer.add_image(tag, img_tensor, global_step=step)
|
||||||
@ -268,10 +279,7 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
|
|||||||
optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate)
|
optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate)
|
||||||
|
|
||||||
if shared.opts.training_enable_tensorboard:
|
if shared.opts.training_enable_tensorboard:
|
||||||
os.makedirs(os.path.join(log_directory, "tensorboard"), exist_ok=True)
|
tensorboard_writer = tensorboard_setup(log_directory)
|
||||||
tensorboard_writer = SummaryWriter(
|
|
||||||
log_dir=os.path.join(log_directory, "tensorboard"),
|
|
||||||
flush_secs=shared.opts.training_tensorboard_flush_every)
|
|
||||||
|
|
||||||
pbar = tqdm.tqdm(enumerate(ds), total=steps-initial_step)
|
pbar = tqdm.tqdm(enumerate(ds), total=steps-initial_step)
|
||||||
for i, entries in pbar:
|
for i, entries in pbar:
|
||||||
@ -308,10 +316,8 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
|
|||||||
embedding_yet_to_be_embedded = True
|
embedding_yet_to_be_embedded = True
|
||||||
|
|
||||||
if shared.opts.training_enable_tensorboard:
|
if shared.opts.training_enable_tensorboard:
|
||||||
tensorboard_add_scaler(tensorboard_writer, "Loss/train", losses.mean(), embedding.step)
|
tensorboard_add(tensorboard_writer, loss=losses.mean(), global_step=embedding.step,
|
||||||
tensorboard_add_scaler(tensorboard_writer, f"Loss/train/epoch-{epoch_num}", losses.mean(), epoch_step)
|
step=epoch_step, learn_rate=scheduler.learn_rate, epoch_num=epoch_num)
|
||||||
tensorboard_add_scaler(tensorboard_writer, "Learn rate/train", scheduler.learn_rate, embedding.step)
|
|
||||||
tensorboard_add_scaler(tensorboard_writer, f"Learn rate/train/epoch-{epoch_num}", scheduler.learn_rate, epoch_step)
|
|
||||||
|
|
||||||
write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, len(ds), {
|
write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, len(ds), {
|
||||||
"loss": f"{losses.mean():.7f}",
|
"loss": f"{losses.mean():.7f}",
|
||||||
@ -377,7 +383,10 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
|
|||||||
embedding_yet_to_be_embedded = False
|
embedding_yet_to_be_embedded = False
|
||||||
|
|
||||||
image.save(last_saved_image)
|
image.save(last_saved_image)
|
||||||
tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", image, embedding.step)
|
|
||||||
|
if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
|
||||||
|
tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}",
|
||||||
|
image, embedding.step)
|
||||||
|
|
||||||
last_saved_image += f", prompt: {preview_text}"
|
last_saved_image += f", prompt: {preview_text}"
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user