From 598f7fcd84f655dd204ad5e258dc1c41cc806cde Mon Sep 17 00:00:00 2001 From: aria1th <35677394+aria1th@users.noreply.github.com> Date: Mon, 16 Jan 2023 02:46:21 +0900 Subject: [PATCH 1/2] Fix loss_dict problem --- modules/hypernetworks/hypernetwork.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py index bbd1f673..438e3e9f 100644 --- a/modules/hypernetworks/hypernetwork.py +++ b/modules/hypernetworks/hypernetwork.py @@ -561,6 +561,7 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi _loss_step = 0 #internal # size = len(ds.indexes) # loss_dict = defaultdict(lambda : deque(maxlen = 1024)) + loss_logging = [] # losses = torch.zeros((size,)) # previous_mean_losses = [0] # previous_mean_loss = 0 @@ -601,6 +602,7 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi else: c = stack_conds(batch.cond).to(devices.device, non_blocking=pin_memory) loss = shared.sd_model(x, c)[0] / gradient_step + loss_logging.append(loss.item()) del x del c @@ -644,7 +646,7 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi if shared.opts.training_enable_tensorboard: epoch_num = hypernetwork.step // len(ds) epoch_step = hypernetwork.step - (epoch_num * len(ds)) + 1 - mean_loss = sum(sum(x) for x in loss_dict.values()) / sum(len(x) for x in loss_dict.values()) + mean_loss = sum(loss_logging) / len(loss_logging) textual_inversion.tensorboard_add(tensorboard_writer, loss=mean_loss, global_step=hypernetwork.step, step=epoch_step, learn_rate=scheduler.learn_rate, epoch_num=epoch_num) textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, steps_per_epoch, { From 13445738d974edcca5ff2f4f8f3833c1f3433e5e Mon Sep 17 00:00:00 2001 From: aria1th <35677394+aria1th@users.noreply.github.com> Date: Mon, 16 Jan 2023 03:02:54 +0900 Subject: [PATCH 2/2] Fix tensorboard related functions --- modules/hypernetworks/hypernetwork.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py index 438e3e9f..c963fc40 100644 --- a/modules/hypernetworks/hypernetwork.py +++ b/modules/hypernetworks/hypernetwork.py @@ -561,7 +561,7 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi _loss_step = 0 #internal # size = len(ds.indexes) # loss_dict = defaultdict(lambda : deque(maxlen = 1024)) - loss_logging = [] + loss_logging = deque(maxlen=len(ds) * 3) # this should be configurable parameter, this is 3 * epoch(dataset size) # losses = torch.zeros((size,)) # previous_mean_losses = [0] # previous_mean_loss = 0 @@ -602,7 +602,6 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi else: c = stack_conds(batch.cond).to(devices.device, non_blocking=pin_memory) loss = shared.sd_model(x, c)[0] / gradient_step - loss_logging.append(loss.item()) del x del c @@ -612,7 +611,7 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi # go back until we reach gradient accumulation steps if (j + 1) % gradient_step != 0: continue - + loss_logging.append(_loss_step) if clip_grad: clip_grad(weights, clip_grad_sched.learn_rate) @@ -690,9 +689,6 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi processed = processing.process_images(p) image = processed.images[0] if len(processed.images) > 0 else None - - if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images: - textual_inversion.tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", image, hypernetwork.step) if unload: shared.sd_model.cond_stage_model.to(devices.cpu) @@ -703,7 +699,10 @@ def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradi hypernetwork.train() if image is not None: shared.state.assign_current_image(image) - + if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images: + textual_inversion.tensorboard_add_image(tensorboard_writer, + f"Validation at epoch {epoch_num}", image, + hypernetwork.step) last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False) last_saved_image += f", prompt: {preview_text}"