From 4f1d501bef8d2b26f7d92b9f519ac1a69921eea3 Mon Sep 17 00:00:00 2001
From: Divam Gupta
Date: Fri, 28 Oct 2022 18:51:04 -0400
Subject: [PATCH] inpainting 1.5 model added in tf backend, options box refactored

---
 .../diffusionbee_backend.py                   |   96 +-
 backends/stable_diffusion_tf/downloader.py    |    2 +-
 .../stable_diffusion_tf/constants.py          | 1678 +++++++++++++++++
 .../stable_diffusion_tf/stable_diffusion.py   |   56 +-
 electron_app/src/components/Img2Img.vue       |   89 +-
 electron_app/src/components/ImgGenerate.vue   |   86 +-
 .../src/components_bare/SDOptionsDropdown.vue |  121 ++
 7 files changed, 1962 insertions(+), 166 deletions(-)
 create mode 100644 electron_app/src/components_bare/SDOptionsDropdown.vue

diff --git a/backends/stable_diffusion_tf/diffusionbee_backend.py b/backends/stable_diffusion_tf/diffusionbee_backend.py
index e2c17fcc..e6f5a0cc 100644
--- a/backends/stable_diffusion_tf/diffusionbee_backend.py
+++ b/backends/stable_diffusion_tf/diffusionbee_backend.py
@@ -91,20 +91,57 @@ def process_opt(d, generator):
         print("sdbk nwim %s"%(fpath) )


+cur_model_id = -1
+cur_model = None
+def get_sd_model(model_id):
+    global p1 , p2 , p3 , p4 , p1_15 , p2_15 , p3_15 , p4_15
+    global cur_model_id , cur_model
+    if cur_model_id != model_id:
+
+        if cur_model is not None:
+            cur_model = None
+            time.sleep(1)
+        if model_id == 0:
+
+            print("sdbk gnms Loading SD Model" )
+            cur_model = StableDiffusion(img_height=512, img_width=512, jit_compile=False, download_weights=False, is_sd_15_inpaint=False)
+            cur_model.text_encoder .load_weights(p2)
+            cur_model.diffusion_model.load_weights(p1)
+            cur_model.decoder.load_weights(p3)
+            cur_model.encoder.load_weights(p4)
+            print("sdbk mdvr 1.4tf")
+        elif model_id == 1:
+            print("sdbk mdvr 1.5tf_inp")
+            print("sdbk gnms Loading SD Inpainting Model" )
+            cur_model = StableDiffusion(img_height=512, img_width=512, jit_compile=False, download_weights=False, is_sd_15_inpaint=True)
+            cur_model.text_encoder .load_weights(p2_15)
+            cur_model.diffusion_model.load_weights(p1_15)
+            cur_model.decoder.load_weights(p3_15)
+            cur_model.encoder.load_weights(p4_15)
+        else:
+            assert False
+
+        cur_model_id = model_id
+
+    return cur_model
+
+
 def main():

+    global p1 , p2 , p3 , p4 , p1_15 , p2_15 , p3_15 , p4_15
+
     print("sdbk mltl Loading Model")

     for _ in range(5):
         try:
-            p1 = ProgressBarDownloader(title="Downloading Model 1/4").download(
+            p1 = ProgressBarDownloader(title="Downloading Model 1/8").download(
                 url="https://huggingface.co/fchollet/stable-diffusion/resolve/main/diffusion_model.h5",
                 md5_checksum="72db3d55b60691e1f8a6a68cd9f47ad0",
                 verify_ssl=False,
                 extract_zip=False,
             )

-            p2 = ProgressBarDownloader(title="Downloading Model 2/4").download(
+            p2 = ProgressBarDownloader(title="Downloading Model 2/8").download(
                 url="https://huggingface.co/fchollet/stable-diffusion/resolve/main/text_encoder.h5",
                 md5_checksum="9ea30bed7728473b4270a76aabf1836b",
                 verify_ssl=False,
@@ -112,19 +149,50 @@ def main():
             )


-            p3 = ProgressBarDownloader(title="Downloading Model 3/4").download(
+            p3 = ProgressBarDownloader(title="Downloading Model 3/8").download(
                 url="https://huggingface.co/fchollet/stable-diffusion/resolve/main/decoder.h5",
                 md5_checksum="8c86dc2fadfb0da9712a7a06cfa7bf11",
                 verify_ssl=False,
                 extract_zip=False,
             )

-            p4 = ProgressBarDownloader(title="Downloading Model 4/4").download(
+            p4 = ProgressBarDownloader(title="Downloading Model 4/8").download(
                 url="https://huggingface.co/divamgupta/stable-diffusion-tensorflow/resolve/main/encoder_newW.h5",
                 md5_checksum="bef951ed69aa5a7a3acae0ab0308b630",
                 verify_ssl=False,
                 extract_zip=False,
             )

+            p1_15 = ProgressBarDownloader(title="Downloading Model 5/8").download(
+                url="https://huggingface.co/divamgupta/stable-diffusion-tensorflow/resolve/main/diffusion_model_15_inpaint.h5",
+                md5_checksum="fd5868208a33dc4594559433bc493334",
+                verify_ssl=False,
+                extract_zip=False,
+            )
+
+            p2_15 = ProgressBarDownloader(title="Downloading Model 6/8").download(
+                url="https://huggingface.co/divamgupta/stable-diffusion-tensorflow/resolve/main/text_encoder_15_inpaint.h5",
+                md5_checksum="859cc286026b9c1a510d87f85295b4a4",
+                verify_ssl=False,
+                extract_zip=False,
+            )
+
+
+            p3_15 = ProgressBarDownloader(title="Downloading Model 7/8").download(
+                url="https://huggingface.co/divamgupta/stable-diffusion-tensorflow/resolve/main/decoder_15_inpaint.h5",
+                md5_checksum="aecfa5cbf18a06158e0dde99d6d2fadf",
+                verify_ssl=False,
+                extract_zip=False,
+            )
+
+            p4_15 = ProgressBarDownloader(title="Downloading Model 8/8").download(
+                url="https://huggingface.co/divamgupta/stable-diffusion-tensorflow/resolve/main/encoder_15_inpaint.h5",
+                md5_checksum="f73e95b6d5e1ed32e9a15fe31b1ede70",
+                verify_ssl=False,
+                extract_zip=False,
+            )
+
+            break
         except Exception as e:
             pass
@@ -140,15 +208,12 @@ def main():

     cur_size = (512 , 512)

-    generator = StableDiffusion(img_height=512, img_width=512, jit_compile=False, download_weights=False)
-    generator.text_encoder .load_weights(p2)
-    generator.diffusion_model.load_weights(p1)
-    generator.decoder.load_weights(p3)
-    generator.encoder.load_weights(p4)
+    generator = get_sd_model(0)
+
     default_d = { "W" : 512 , "H" : 512, "num_imgs":1 , "ddim_steps" : 25 , "scale" : 7.5, "batch_size":1 , "input_image" : None, "img_strength": 0.5
-    , "negative_prompt" : "" , "mask_image" : None,}
+    , "negative_prompt" : "" , "mask_image" : None, "model_id": 0 }

     print("sdbk mdld")
@@ -169,15 +234,8 @@ def main():
                 d = copy.deepcopy(default_d)
                 d.update(d_)
                 print("sdbk inwk") # working on the input
-
-                # if cur_size != (d['W'] , d['H']):
-                #     print("sdbk mltl Loading Model")
-                #     generator = StableDiffusion(img_height= d['H'], img_width=d['W'], jit_compile=False, download_weights=False)
-                #     generator.text_encoder .load_weights(p2)
-                #     generator.diffusion_model.load_weights(p1)
-                #     generator.decoder.load_weights(p3)
-                #     print("sdbk mdld")
-                #     cur_size = (d['W'] , d['H'])
+                generator = None
+                generator = get_sd_model(d['model_id'])

                 process_opt(d, generator)

             except Exception as e:
diff --git a/backends/stable_diffusion_tf/downloader.py b/backends/stable_diffusion_tf/downloader.py
index 835d2787..6ad011ae 100644
--- a/backends/stable_diffusion_tf/downloader.py
+++ b/backends/stable_diffusion_tf/downloader.py
@@ -116,7 +116,7 @@ def download(self, url, out_fname=None, md5_checksum=None,
                     if time.time() - last_time > 0.1:
                         last_time = time.time()
                         print("sdbk mlpr %d"%int(done_percentage) ) # model loading percentage
-                        print("sdbk mlms \"%s\""%("%.2fMB out of %.2fMB"%(dl/1000000 , total_length/1000000) ))
+                        print("sdbk mlms %s"%("%.2fMB out of %.2fMB"%(dl/1000000 , total_length/1000000) ))

         print("sdbk mlpr %d"%int(-1) )
         print("sdbk mltl Checking Model")
diff --git a/backends/stable_diffusion_tf/stable_diffusion_tf/constants.py b/backends/stable_diffusion_tf/stable_diffusion_tf/constants.py
index 512eabba..946bc729 100644
--- a/backends/stable_diffusion_tf/stable_diffusion_tf/constants.py
+++ b/backends/stable_diffusion_tf/stable_diffusion_tf/constants.py
@@ -1,3 +1,1681 @@
+PYTORCH_CKPT_MAPPING = {'text_encoder': [('cond_stage_model.transformer.text_model.embeddings.token_embedding.weight',
+
None), + ('cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', + (1, 0)), + 
('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', + None), + 
('cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', + None), + 
('cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', + (1, 0)), + 
('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', + (1, 0)), + 
('cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', + None), + ('cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', + (1, 0)), + ('cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', + None), + ('cond_stage_model.transformer.text_model.final_layer_norm.weight', None), + ('cond_stage_model.transformer.text_model.final_layer_norm.bias', None)], + 'diffusion_model': [('model.diffusion_model.time_embed.0.weight', (1, 0)), + ('model.diffusion_model.time_embed.0.bias', None), + ('model.diffusion_model.time_embed.2.weight', (1, 0)), + ('model.diffusion_model.time_embed.2.bias', None), + ('model.diffusion_model.input_blocks.0.0.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.0.0.bias', None), + ('model.diffusion_model.input_blocks.1.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.1.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.1.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.1.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.1.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.1.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.1.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.1.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.1.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.1.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.1.1.norm.weight', None), + ('model.diffusion_model.input_blocks.1.1.norm.bias', None), + ('model.diffusion_model.input_blocks.1.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.1.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + 
('model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.1.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.1.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.2.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.2.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.2.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.2.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.2.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.2.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.2.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.2.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.2.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.2.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.2.1.norm.weight', None), + ('model.diffusion_model.input_blocks.2.1.norm.bias', None), + ('model.diffusion_model.input_blocks.2.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.2.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.2.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.2.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.3.0.op.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.3.0.op.bias', None), + ('model.diffusion_model.input_blocks.4.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.4.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.4.0.in_layers.2.weight', (2, 
3, 1, 0)), + ('model.diffusion_model.input_blocks.4.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.4.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.4.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.4.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.4.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.4.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.4.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.4.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.4.0.skip_connection.bias', None), + ('model.diffusion_model.input_blocks.4.1.norm.weight', None), + ('model.diffusion_model.input_blocks.4.1.norm.bias', None), + ('model.diffusion_model.input_blocks.4.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.4.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.4.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.4.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.5.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.5.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.5.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.5.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.5.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.5.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.5.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.5.0.out_layers.0.bias', None), + 
('model.diffusion_model.input_blocks.5.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.5.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.5.1.norm.weight', None), + ('model.diffusion_model.input_blocks.5.1.norm.bias', None), + ('model.diffusion_model.input_blocks.5.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.5.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.5.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.5.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.6.0.op.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.6.0.op.bias', None), + ('model.diffusion_model.input_blocks.7.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.7.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.7.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.7.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.7.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.7.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.7.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.7.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.7.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.7.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.7.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.7.0.skip_connection.bias', None), + ('model.diffusion_model.input_blocks.7.1.norm.weight', None), + ('model.diffusion_model.input_blocks.7.1.norm.bias', None), + 
('model.diffusion_model.input_blocks.7.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.7.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.7.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.7.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.8.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.8.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.8.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.8.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.8.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.8.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.8.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.8.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.8.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.8.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.8.1.norm.weight', None), + ('model.diffusion_model.input_blocks.8.1.norm.bias', None), + ('model.diffusion_model.input_blocks.8.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.8.1.proj_in.bias', None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + 
('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.input_blocks.8.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.8.1.proj_out.bias', None), + ('model.diffusion_model.input_blocks.9.0.op.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.9.0.op.bias', None), + ('model.diffusion_model.input_blocks.10.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.10.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.10.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.10.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.10.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.10.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.10.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.10.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.10.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.10.0.out_layers.3.bias', None), + ('model.diffusion_model.input_blocks.11.0.in_layers.0.weight', None), + ('model.diffusion_model.input_blocks.11.0.in_layers.0.bias', None), + ('model.diffusion_model.input_blocks.11.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.11.0.in_layers.2.bias', None), + ('model.diffusion_model.input_blocks.11.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.input_blocks.11.0.emb_layers.1.bias', None), + ('model.diffusion_model.input_blocks.11.0.out_layers.0.weight', None), + ('model.diffusion_model.input_blocks.11.0.out_layers.0.bias', None), + ('model.diffusion_model.input_blocks.11.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.input_blocks.11.0.out_layers.3.bias', None), + ('model.diffusion_model.middle_block.0.in_layers.0.weight', None), + ('model.diffusion_model.middle_block.0.in_layers.0.bias', None), + ('model.diffusion_model.middle_block.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.0.in_layers.2.bias', None), + 
('model.diffusion_model.middle_block.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.middle_block.0.emb_layers.1.bias', None), + ('model.diffusion_model.middle_block.0.out_layers.0.weight', None), + ('model.diffusion_model.middle_block.0.out_layers.0.bias', None), + ('model.diffusion_model.middle_block.0.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.0.out_layers.3.bias', None), + ('model.diffusion_model.middle_block.1.norm.weight', None), + ('model.diffusion_model.middle_block.1.norm.bias', None), + ('model.diffusion_model.middle_block.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.1.proj_in.bias', None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.middle_block.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.1.proj_out.bias', None), + ('model.diffusion_model.middle_block.2.in_layers.0.weight', None), + ('model.diffusion_model.middle_block.2.in_layers.0.bias', None), + ('model.diffusion_model.middle_block.2.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.2.in_layers.2.bias', None), + ('model.diffusion_model.middle_block.2.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.middle_block.2.emb_layers.1.bias', None), + ('model.diffusion_model.middle_block.2.out_layers.0.weight', None), + ('model.diffusion_model.middle_block.2.out_layers.0.bias', None), + ('model.diffusion_model.middle_block.2.out_layers.3.weight', (2, 3, 1, 0)), + ('model.diffusion_model.middle_block.2.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.0.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.0.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.0.0.in_layers.2.weight', (2, 3, 1, 
0)), + ('model.diffusion_model.output_blocks.0.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.0.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.0.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.0.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.0.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.0.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.0.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.0.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.0.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.1.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.1.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.1.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.1.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.1.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.1.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.1.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.1.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.1.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.1.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.1.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.1.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.2.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.2.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.2.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.2.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.2.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.2.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.2.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.2.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.2.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.2.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.2.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.2.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.2.1.conv.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.2.1.conv.bias', None), + ('model.diffusion_model.output_blocks.3.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.3.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.3.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.3.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.3.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.3.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.3.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.3.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.3.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.3.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.3.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.3.0.skip_connection.bias', None), + 
('model.diffusion_model.output_blocks.3.1.norm.weight', None), + ('model.diffusion_model.output_blocks.3.1.norm.bias', None), + ('model.diffusion_model.output_blocks.3.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.3.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.3.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.3.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.4.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.4.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.4.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.4.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.4.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.4.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.4.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.4.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.4.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.4.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.4.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.4.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.4.1.norm.weight', None), + ('model.diffusion_model.output_blocks.4.1.norm.bias', None), + ('model.diffusion_model.output_blocks.4.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.4.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.weight', + None), + 
('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.4.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.4.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.5.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.5.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.5.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.5.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.5.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.5.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.5.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.5.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.5.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.5.1.norm.weight', None), + ('model.diffusion_model.output_blocks.5.1.norm.bias', None), + ('model.diffusion_model.output_blocks.5.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_v.weight', + (1, 
0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.5.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.5.2.conv.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.5.2.conv.bias', None), + ('model.diffusion_model.output_blocks.6.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.6.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.6.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.6.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.6.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.6.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.6.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.6.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.6.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.6.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.6.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.6.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.6.1.norm.weight', None), + ('model.diffusion_model.output_blocks.6.1.norm.bias', None), + ('model.diffusion_model.output_blocks.6.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.6.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + 
('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.6.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.6.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.7.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.7.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.7.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.7.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.7.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.7.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.7.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.7.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.7.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.7.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.7.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.7.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.7.1.norm.weight', None), + ('model.diffusion_model.output_blocks.7.1.norm.bias', None), + ('model.diffusion_model.output_blocks.7.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.7.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + 
('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.7.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.7.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.8.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.8.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.8.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.8.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.8.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.8.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.8.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.8.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.8.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.8.1.norm.weight', None), + ('model.diffusion_model.output_blocks.8.1.norm.bias', None), + ('model.diffusion_model.output_blocks.8.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.weight', 
+ None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.8.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.8.2.conv.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.8.2.conv.bias', None), + ('model.diffusion_model.output_blocks.9.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.9.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.9.0.in_layers.2.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.9.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.9.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.9.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.9.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.9.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.9.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.9.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.9.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.9.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.9.1.norm.weight', None), + ('model.diffusion_model.output_blocks.9.1.norm.bias', None), + ('model.diffusion_model.output_blocks.9.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.9.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + 
('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.9.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.9.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.10.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.10.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.10.0.in_layers.2.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.10.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.10.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.10.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.10.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.10.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.10.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.10.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.10.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.10.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.10.1.norm.weight', None), + ('model.diffusion_model.output_blocks.10.1.norm.bias', None), + ('model.diffusion_model.output_blocks.10.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.10.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.bias', + None), + 
('model.diffusion_model.output_blocks.10.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.10.1.proj_out.bias', None), + ('model.diffusion_model.output_blocks.11.0.in_layers.0.weight', None), + ('model.diffusion_model.output_blocks.11.0.in_layers.0.bias', None), + ('model.diffusion_model.output_blocks.11.0.in_layers.2.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.11.0.in_layers.2.bias', None), + ('model.diffusion_model.output_blocks.11.0.emb_layers.1.weight', (1, 0)), + ('model.diffusion_model.output_blocks.11.0.emb_layers.1.bias', None), + ('model.diffusion_model.output_blocks.11.0.out_layers.0.weight', None), + ('model.diffusion_model.output_blocks.11.0.out_layers.0.bias', None), + ('model.diffusion_model.output_blocks.11.0.out_layers.3.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.11.0.out_layers.3.bias', None), + ('model.diffusion_model.output_blocks.11.0.skip_connection.weight', + (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.11.0.skip_connection.bias', None), + ('model.diffusion_model.output_blocks.11.1.norm.weight', None), + ('model.diffusion_model.output_blocks.11.1.norm.bias', None), + ('model.diffusion_model.output_blocks.11.1.proj_in.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.11.1.proj_in.bias', None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.weight', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.weight', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_q.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_k.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.weight', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.bias', + None), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.weight', + (1, 0)), + ('model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.bias', + None), + ('model.diffusion_model.output_blocks.11.1.proj_out.weight', (2, 3, 1, 0)), + ('model.diffusion_model.output_blocks.11.1.proj_out.bias', None), + ('model.diffusion_model.out.0.weight', None), + ('model.diffusion_model.out.0.bias', None), + ('model.diffusion_model.out.2.weight', (2, 3, 1, 0)), + 
('model.diffusion_model.out.2.bias', None)], + 'decoder': [('first_stage_model.post_quant_conv.weight', (2, 3, 1, 0)), + ('first_stage_model.post_quant_conv.bias', None), + ('first_stage_model.decoder.conv_in.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.conv_in.bias', None), + ('first_stage_model.decoder.mid.block_1.norm1.weight', None), + ('first_stage_model.decoder.mid.block_1.norm1.bias', None), + ('first_stage_model.decoder.mid.block_1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.block_1.conv1.bias', None), + ('first_stage_model.decoder.mid.block_1.norm2.weight', None), + ('first_stage_model.decoder.mid.block_1.norm2.bias', None), + ('first_stage_model.decoder.mid.block_1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.block_1.conv2.bias', None), + ('first_stage_model.decoder.mid.attn_1.norm.weight', None), + ('first_stage_model.decoder.mid.attn_1.norm.bias', None), + ('first_stage_model.decoder.mid.attn_1.q.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.attn_1.q.bias', None), + ('first_stage_model.decoder.mid.attn_1.k.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.attn_1.k.bias', None), + ('first_stage_model.decoder.mid.attn_1.v.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.attn_1.v.bias', None), + ('first_stage_model.decoder.mid.attn_1.proj_out.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.attn_1.proj_out.bias', None), + ('first_stage_model.decoder.mid.block_2.norm1.weight', None), + ('first_stage_model.decoder.mid.block_2.norm1.bias', None), + ('first_stage_model.decoder.mid.block_2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.block_2.conv1.bias', None), + ('first_stage_model.decoder.mid.block_2.norm2.weight', None), + ('first_stage_model.decoder.mid.block_2.norm2.bias', None), + ('first_stage_model.decoder.mid.block_2.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.mid.block_2.conv2.bias', None), + ('first_stage_model.decoder.up.3.block.0.norm1.weight', None), + ('first_stage_model.decoder.up.3.block.0.norm1.bias', None), + ('first_stage_model.decoder.up.3.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.0.conv1.bias', None), + ('first_stage_model.decoder.up.3.block.0.norm2.weight', None), + ('first_stage_model.decoder.up.3.block.0.norm2.bias', None), + ('first_stage_model.decoder.up.3.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.0.conv2.bias', None), + ('first_stage_model.decoder.up.3.block.1.norm1.weight', None), + ('first_stage_model.decoder.up.3.block.1.norm1.bias', None), + ('first_stage_model.decoder.up.3.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.1.conv1.bias', None), + ('first_stage_model.decoder.up.3.block.1.norm2.weight', None), + ('first_stage_model.decoder.up.3.block.1.norm2.bias', None), + ('first_stage_model.decoder.up.3.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.1.conv2.bias', None), + ('first_stage_model.decoder.up.3.block.2.norm1.weight', None), + ('first_stage_model.decoder.up.3.block.2.norm1.bias', None), + ('first_stage_model.decoder.up.3.block.2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.2.conv1.bias', None), + ('first_stage_model.decoder.up.3.block.2.norm2.weight', None), + ('first_stage_model.decoder.up.3.block.2.norm2.bias', None), + ('first_stage_model.decoder.up.3.block.2.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.block.2.conv2.bias', None), + 
('first_stage_model.decoder.up.3.upsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.3.upsample.conv.bias', None), + ('first_stage_model.decoder.up.2.block.0.norm1.weight', None), + ('first_stage_model.decoder.up.2.block.0.norm1.bias', None), + ('first_stage_model.decoder.up.2.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.0.conv1.bias', None), + ('first_stage_model.decoder.up.2.block.0.norm2.weight', None), + ('first_stage_model.decoder.up.2.block.0.norm2.bias', None), + ('first_stage_model.decoder.up.2.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.0.conv2.bias', None), + ('first_stage_model.decoder.up.2.block.1.norm1.weight', None), + ('first_stage_model.decoder.up.2.block.1.norm1.bias', None), + ('first_stage_model.decoder.up.2.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.1.conv1.bias', None), + ('first_stage_model.decoder.up.2.block.1.norm2.weight', None), + ('first_stage_model.decoder.up.2.block.1.norm2.bias', None), + ('first_stage_model.decoder.up.2.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.1.conv2.bias', None), + ('first_stage_model.decoder.up.2.block.2.norm1.weight', None), + ('first_stage_model.decoder.up.2.block.2.norm1.bias', None), + ('first_stage_model.decoder.up.2.block.2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.2.conv1.bias', None), + ('first_stage_model.decoder.up.2.block.2.norm2.weight', None), + ('first_stage_model.decoder.up.2.block.2.norm2.bias', None), + ('first_stage_model.decoder.up.2.block.2.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.block.2.conv2.bias', None), + ('first_stage_model.decoder.up.2.upsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.2.upsample.conv.bias', None), + ('first_stage_model.decoder.up.1.block.0.norm1.weight', None), + ('first_stage_model.decoder.up.1.block.0.norm1.bias', None), + ('first_stage_model.decoder.up.1.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.0.conv1.bias', None), + ('first_stage_model.decoder.up.1.block.0.norm2.weight', None), + ('first_stage_model.decoder.up.1.block.0.norm2.bias', None), + ('first_stage_model.decoder.up.1.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.0.conv2.bias', None), + ('first_stage_model.decoder.up.1.block.0.nin_shortcut.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.0.nin_shortcut.bias', None), + ('first_stage_model.decoder.up.1.block.1.norm1.weight', None), + ('first_stage_model.decoder.up.1.block.1.norm1.bias', None), + ('first_stage_model.decoder.up.1.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.1.conv1.bias', None), + ('first_stage_model.decoder.up.1.block.1.norm2.weight', None), + ('first_stage_model.decoder.up.1.block.1.norm2.bias', None), + ('first_stage_model.decoder.up.1.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.1.conv2.bias', None), + ('first_stage_model.decoder.up.1.block.2.norm1.weight', None), + ('first_stage_model.decoder.up.1.block.2.norm1.bias', None), + ('first_stage_model.decoder.up.1.block.2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.block.2.conv1.bias', None), + ('first_stage_model.decoder.up.1.block.2.norm2.weight', None), + ('first_stage_model.decoder.up.1.block.2.norm2.bias', None), + ('first_stage_model.decoder.up.1.block.2.conv2.weight', (2, 3, 1, 0)), + 
('first_stage_model.decoder.up.1.block.2.conv2.bias', None), + ('first_stage_model.decoder.up.1.upsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.1.upsample.conv.bias', None), + ('first_stage_model.decoder.up.0.block.0.norm1.weight', None), + ('first_stage_model.decoder.up.0.block.0.norm1.bias', None), + ('first_stage_model.decoder.up.0.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.0.conv1.bias', None), + ('first_stage_model.decoder.up.0.block.0.norm2.weight', None), + ('first_stage_model.decoder.up.0.block.0.norm2.bias', None), + ('first_stage_model.decoder.up.0.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.0.conv2.bias', None), + ('first_stage_model.decoder.up.0.block.0.nin_shortcut.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.0.nin_shortcut.bias', None), + ('first_stage_model.decoder.up.0.block.1.norm1.weight', None), + ('first_stage_model.decoder.up.0.block.1.norm1.bias', None), + ('first_stage_model.decoder.up.0.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.1.conv1.bias', None), + ('first_stage_model.decoder.up.0.block.1.norm2.weight', None), + ('first_stage_model.decoder.up.0.block.1.norm2.bias', None), + ('first_stage_model.decoder.up.0.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.1.conv2.bias', None), + ('first_stage_model.decoder.up.0.block.2.norm1.weight', None), + ('first_stage_model.decoder.up.0.block.2.norm1.bias', None), + ('first_stage_model.decoder.up.0.block.2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.2.conv1.bias', None), + ('first_stage_model.decoder.up.0.block.2.norm2.weight', None), + ('first_stage_model.decoder.up.0.block.2.norm2.bias', None), + ('first_stage_model.decoder.up.0.block.2.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.up.0.block.2.conv2.bias', None), + ('first_stage_model.decoder.norm_out.weight', None), + ('first_stage_model.decoder.norm_out.bias', None), + ('first_stage_model.decoder.conv_out.weight', (2, 3, 1, 0)), + ('first_stage_model.decoder.conv_out.bias', None)], + 'encoder': [('first_stage_model.encoder.conv_in.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.conv_in.bias', None), + ('first_stage_model.encoder.down.0.block.0.norm1.weight', None), + ('first_stage_model.encoder.down.0.block.0.norm1.bias', None), + ('first_stage_model.encoder.down.0.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.0.block.0.conv1.bias', None), + ('first_stage_model.encoder.down.0.block.0.norm2.weight', None), + ('first_stage_model.encoder.down.0.block.0.norm2.bias', None), + ('first_stage_model.encoder.down.0.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.0.block.0.conv2.bias', None), + ('first_stage_model.encoder.down.0.block.1.norm1.weight', None), + ('first_stage_model.encoder.down.0.block.1.norm1.bias', None), + ('first_stage_model.encoder.down.0.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.0.block.1.conv1.bias', None), + ('first_stage_model.encoder.down.0.block.1.norm2.weight', None), + ('first_stage_model.encoder.down.0.block.1.norm2.bias', None), + ('first_stage_model.encoder.down.0.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.0.block.1.conv2.bias', None), + ('first_stage_model.encoder.down.0.downsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.0.downsample.conv.bias', None), + 
('first_stage_model.encoder.down.1.block.0.norm1.weight', None), + ('first_stage_model.encoder.down.1.block.0.norm1.bias', None), + ('first_stage_model.encoder.down.1.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.block.0.conv1.bias', None), + ('first_stage_model.encoder.down.1.block.0.norm2.weight', None), + ('first_stage_model.encoder.down.1.block.0.norm2.bias', None), + ('first_stage_model.encoder.down.1.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.block.0.conv2.bias', None), + ('first_stage_model.encoder.down.1.block.0.nin_shortcut.weight', + (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.block.0.nin_shortcut.bias', None), + ('first_stage_model.encoder.down.1.block.1.norm1.weight', None), + ('first_stage_model.encoder.down.1.block.1.norm1.bias', None), + ('first_stage_model.encoder.down.1.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.block.1.conv1.bias', None), + ('first_stage_model.encoder.down.1.block.1.norm2.weight', None), + ('first_stage_model.encoder.down.1.block.1.norm2.bias', None), + ('first_stage_model.encoder.down.1.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.block.1.conv2.bias', None), + ('first_stage_model.encoder.down.1.downsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.1.downsample.conv.bias', None), + ('first_stage_model.encoder.down.2.block.0.norm1.weight', None), + ('first_stage_model.encoder.down.2.block.0.norm1.bias', None), + ('first_stage_model.encoder.down.2.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.block.0.conv1.bias', None), + ('first_stage_model.encoder.down.2.block.0.norm2.weight', None), + ('first_stage_model.encoder.down.2.block.0.norm2.bias', None), + ('first_stage_model.encoder.down.2.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.block.0.conv2.bias', None), + ('first_stage_model.encoder.down.2.block.0.nin_shortcut.weight', + (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.block.0.nin_shortcut.bias', None), + ('first_stage_model.encoder.down.2.block.1.norm1.weight', None), + ('first_stage_model.encoder.down.2.block.1.norm1.bias', None), + ('first_stage_model.encoder.down.2.block.1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.block.1.conv1.bias', None), + ('first_stage_model.encoder.down.2.block.1.norm2.weight', None), + ('first_stage_model.encoder.down.2.block.1.norm2.bias', None), + ('first_stage_model.encoder.down.2.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.block.1.conv2.bias', None), + ('first_stage_model.encoder.down.2.downsample.conv.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.2.downsample.conv.bias', None), + ('first_stage_model.encoder.down.3.block.0.norm1.weight', None), + ('first_stage_model.encoder.down.3.block.0.norm1.bias', None), + ('first_stage_model.encoder.down.3.block.0.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.3.block.0.conv1.bias', None), + ('first_stage_model.encoder.down.3.block.0.norm2.weight', None), + ('first_stage_model.encoder.down.3.block.0.norm2.bias', None), + ('first_stage_model.encoder.down.3.block.0.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.3.block.0.conv2.bias', None), + ('first_stage_model.encoder.down.3.block.1.norm1.weight', None), + ('first_stage_model.encoder.down.3.block.1.norm1.bias', None), + ('first_stage_model.encoder.down.3.block.1.conv1.weight', (2, 3, 1, 0)), + 
('first_stage_model.encoder.down.3.block.1.conv1.bias', None), + ('first_stage_model.encoder.down.3.block.1.norm2.weight', None), + ('first_stage_model.encoder.down.3.block.1.norm2.bias', None), + ('first_stage_model.encoder.down.3.block.1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.down.3.block.1.conv2.bias', None), + ('first_stage_model.encoder.mid.block_1.norm1.weight', None), + ('first_stage_model.encoder.mid.block_1.norm1.bias', None), + ('first_stage_model.encoder.mid.block_1.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.block_1.conv1.bias', None), + ('first_stage_model.encoder.mid.block_1.norm2.weight', None), + ('first_stage_model.encoder.mid.block_1.norm2.bias', None), + ('first_stage_model.encoder.mid.block_1.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.block_1.conv2.bias', None), + ('first_stage_model.encoder.mid.attn_1.norm.weight', None), + ('first_stage_model.encoder.mid.attn_1.norm.bias', None), + ('first_stage_model.encoder.mid.attn_1.q.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.attn_1.q.bias', None), + ('first_stage_model.encoder.mid.attn_1.k.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.attn_1.k.bias', None), + ('first_stage_model.encoder.mid.attn_1.v.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.attn_1.v.bias', None), + ('first_stage_model.encoder.mid.attn_1.proj_out.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.attn_1.proj_out.bias', None), + ('first_stage_model.encoder.mid.block_2.norm1.weight', None), + ('first_stage_model.encoder.mid.block_2.norm1.bias', None), + ('first_stage_model.encoder.mid.block_2.conv1.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.block_2.conv1.bias', None), + ('first_stage_model.encoder.mid.block_2.norm2.weight', None), + ('first_stage_model.encoder.mid.block_2.norm2.bias', None), + ('first_stage_model.encoder.mid.block_2.conv2.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.mid.block_2.conv2.bias', None), + ('first_stage_model.encoder.norm_out.weight', None), + ('first_stage_model.encoder.norm_out.bias', None), + ('first_stage_model.encoder.conv_out.weight', (2, 3, 1, 0)), + ('first_stage_model.encoder.conv_out.bias', None), + ('first_stage_model.quant_conv.weight', (2, 3, 1, 0)), + ('first_stage_model.quant_conv.bias', None)]} + + _UNCONDITIONAL_TOKENS = [ 49406, 49407, diff --git a/backends/stable_diffusion_tf/stable_diffusion_tf/stable_diffusion.py b/backends/stable_diffusion_tf/stable_diffusion_tf/stable_diffusion.py index 0029e77b..ab6fac2d 100644 --- a/backends/stable_diffusion_tf/stable_diffusion_tf/stable_diffusion.py +++ b/backends/stable_diffusion_tf/stable_diffusion_tf/stable_diffusion.py @@ -8,7 +8,7 @@ from .diffusion_model import UNetModel from .clip_encoder import CLIPTextTransformer from .clip_tokenizer import SimpleTokenizer -from .constants import _UNCONDITIONAL_TOKENS, _ALPHAS_CUMPROD +from .constants import _UNCONDITIONAL_TOKENS, _ALPHAS_CUMPROD, PYTORCH_CKPT_MAPPING from .stdin_input import is_avail, get_input from PIL import Image, ImageOps MAX_TEXT_LEN = 77 @@ -28,17 +28,18 @@ def process_inp_img(input_image): input_image = ImageOps.fit(input_image, (new_w, new_h), method = Image.BILINEAR , bleed = 0.0, centering =(0.5, 0.5)) input_image = np.array(input_image)[... 
, :3] - input_image = (input_image.astype("float") / 255.0)*2 - 1 + input_image = (input_image.astype("float32") / 255.0)*2 - 1 return new_h , new_w , input_image class StableDiffusion: - def __init__(self, img_height=1000, img_width=1000, jit_compile=False, download_weights=True): + def __init__(self, img_height=1000, img_width=1000, jit_compile=False, download_weights=True , is_sd_15_inpaint=False): self.img_height = img_height self.img_width = img_width self.tokenizer = SimpleTokenizer() + self.is_sd_15_inpaint = is_sd_15_inpaint - text_encoder, diffusion_model, decoder, encoder , text_encoder_f , diffusion_model_f , decoder_f , encoder_f = get_models(img_height, img_width, download_weights=download_weights) + text_encoder, diffusion_model, decoder, encoder , text_encoder_f , diffusion_model_f , decoder_f , encoder_f = get_models(img_height, img_width, download_weights=download_weights, is_sd_15_inpaint=is_sd_15_inpaint ) self.text_encoder = text_encoder self.diffusion_model = diffusion_model self.decoder = decoder @@ -147,9 +148,16 @@ def generate( img_height, img_width , timesteps, batch_size, seed , input_image=input_image, input_img_noise_t=input_img_noise_t ) - if input_image is not None: + if input_image is not None and (not self.is_sd_15_inpaint): timesteps = timesteps[: int(len(timesteps)*input_image_strength)] + + + if self.is_sd_15_inpaint: + masked_img = input_image[None] * (1-mask_image[None]) + masked_inp_enc = self.encoder_f( masked_img ) + masked_inp_enc = tf.repeat(masked_inp_enc , batch_size , axis=0) + # Diffusion stage ii = 0 progbar = tqdm(list(enumerate(timesteps))[::-1]) @@ -163,8 +171,13 @@ def generate( percentage = 100*ii/len(timesteps) ii += 1 print("sdbk dnpr "+str(percentage) ) # done percentage + + latent_cat = latent + if self.is_sd_15_inpaint: + latent_cat = tf.concat([latent , tf.repeat(mask_image_sm[None] , batch_size , axis=0) , masked_inp_enc ], axis=-1) + e_t = self.get_model_output( - latent, + latent_cat, timestep, context, unconditional_context, @@ -176,7 +189,7 @@ def generate( latent, e_t, index, a_t, a_prev, temperature, seed + index ) - if mask_image is not None and input_image is not None: + if mask_image is not None and input_image is not None and not self.is_sd_15_inpaint : # If mask is provided, noise at current timestep will be added to input image. # The intermediate latent will be merged with input latent. 
latent_orgin, _, _ = self.get_starting_parameters( @@ -191,8 +204,8 @@ def generate( else: decoded = self.decoder.predict_on_batch(latent) - if mask_image is not None: - decoded = input_image * (1-mask_image) + decoded * mask_image + if mask_image is not None and (not self.is_sd_15_inpaint): + decoded = input_image[None] * (1-mask_image[None]) + decoded * mask_image[None] decoded = ((decoded + 1) / 2) * 255 return np.clip(decoded, 0, 255).astype("uint8") @@ -262,7 +275,7 @@ def get_starting_parameters(self, img_height, img_width , timesteps, batch_size, alphas = [_ALPHAS_CUMPROD[t] for t in timesteps] alphas_prev = [1.0] + alphas[:-1] - if input_image is None: + if input_image is None or self.is_sd_15_inpaint : latent_np = np.random.RandomState(seed).normal(size=(batch_size, n_h, n_w, 4)).astype('float32') latent = tf.convert_to_tensor(latent_np) else: @@ -274,8 +287,21 @@ def get_starting_parameters(self, img_height, img_width , timesteps, batch_size, # latent = tf.random.normal((batch_size, n_h, n_w, 4), seed=seed) return latent, alphas, alphas_prev - -def get_models(img_height, img_width, download_weights=True): + def load_weights_from_pytorch_ckpt(self , pytorch_ckpt_path): + import torch + pt_weights = torch.load(pytorch_ckpt_path) + for module_name in ['text_encoder', 'diffusion_model', 'decoder', 'encoder' ]: + module_weights = [] + for i , (key , perm ) in enumerate(PYTORCH_CKPT_MAPPING[module_name]): + w = pt_weights['state_dict'][key].numpy() + if perm is not None: + w = np.transpose(w , perm ) + module_weights.append(w) + getattr(self, module_name).set_weights(module_weights) + print("Loaded %d weights for %s"%(len(module_weights) , module_name)) + + +def get_models(img_height, img_width, download_weights=True, is_sd_15_inpaint=False): n_h = img_height // 8 n_w = img_width // 8 @@ -289,7 +315,11 @@ def get_models(img_height, img_width, download_weights=True): # Creation diffusion UNet context = tf.keras.layers.Input((MAX_TEXT_LEN, 768)) t_emb = tf.keras.layers.Input((320,)) - latent = tf.keras.layers.Input((n_h, n_w, 4)) + if is_sd_15_inpaint: + n_unet_ch = 4 + 4 + 1 + else: + n_unet_ch = 4 + latent = tf.keras.layers.Input((n_h, n_w, n_unet_ch)) unet = UNetModel() diffusion_model_f = unet diffusion_model = tf.keras.models.Model( diff --git a/electron_app/src/components/Img2Img.vue b/electron_app/src/components/Img2Img.vue index d8780d2c..5fabf7ee 100644 --- a/electron_app/src/components/Img2Img.vue +++ b/electron_app/src/components/Img2Img.vue @@ -28,59 +28,19 @@ style="border-radius: 12px 12px 12px 12px; width: calc(100%); resize: none; " class="form-control" v-bind:class="{ 'disabled' : !stable_diffusion.is_input_avail}" - rows="3"> + :rows="is_negative_prompt_avail ? 2:3"> +
Generate
-   [removed: the old inline advanced-options markup from this template; element content not preserved]
+
Stopping ...
@@ -130,6 +90,7 @@ import ImageCanvas from '../components_bare/ImageCanvas.vue' import LoaderModal from '../components_bare/LoaderModal.vue' import Vue from 'vue' +import SDOptionsDropdown from '../components_bare/SDOptionsDropdown.vue' export default { name: 'Img2Img', @@ -137,13 +98,16 @@ export default { app_state : Object , stable_diffusion : Object, }, - components: {LoaderModal, ImageItem , ImageCanvas}, + components: { LoaderModal, ImageItem, ImageCanvas, SDOptionsDropdown }, mounted() { }, computed:{ is_sd_active(){ return this.stable_diffusion.is_input_avail; + }, + this_object(){ + return this; } }, watch: { @@ -170,6 +134,10 @@ export default { img_h : 512 , img_w : 512 , is_inpaint : false, + guidence_scale : 7.5 , + + is_negative_prompt_avail : false, + negative_prompt : "", }; }, methods: { @@ -216,6 +184,10 @@ export default { } if(this.is_inpaint) params['mask_image'] = mask_img; + + if(this.is_negative_prompt_avail) + params['negative_prompt'] = this.negative_prompt; + let that = this; this.backend_error = ""; Vue.set(this,'generated_images' ,[]); @@ -227,11 +199,18 @@ export default { on_img(img_path){ that.generated_images.push(img_path); - if(!(that.app_state.history[history_key])) - Vue.set(that.app_state.history, history_key , { + + let p = { "prompt":that.prompt , "seed": seed, "key":history_key , "imgs" : [] , "inp_img": input_image_with_mask, - "dif_steps" : that.dif_steps , "inp_img_strength" : that.inp_img_strength, "model_version": that.stable_diffusion.model_version - }); + "dif_steps" : that.dif_steps , "inp_img_strength" : that.inp_img_strength, "model_version": that.stable_diffusion.model_version , "guidence_scale" : that.guidence_scale , + } + if(that.stable_diffusion.model_version) + p['model_version'] = that.stable_diffusion.model_version; + if(that.is_negative_prompt_avail) + p['negative_prompt'] = that.negative_prompt; + + if(!(that.app_state.history[history_key])) + Vue.set(that.app_state.history, history_key , p ); that.app_state.history[history_key].imgs.push(img_path) diff --git a/electron_app/src/components/ImgGenerate.vue b/electron_app/src/components/ImgGenerate.vue index 826e14e2..0c0984d0 100644 --- a/electron_app/src/components/ImgGenerate.vue +++ b/electron_app/src/components/ImgGenerate.vue @@ -28,83 +28,7 @@
Advanced options
-   [removed: the old inline advanced-options markup from this template, including the
-    "Enable Negative Prompt" / "Disable Negative Prompt" toggle; element content not preserved]
+
@@ -198,6 +122,7 @@ import LoaderModal from '../components_bare/LoaderModal.vue' import Vue from 'vue' import ImageItem from '../components/ImageItem.vue' import {share_on_arthub} from '../utils.js' +import SDOptionsDropdown from '../components_bare/SDOptionsDropdown.vue' export default { name: 'ImgGenerate', @@ -205,7 +130,7 @@ export default { app_state : Object , stable_diffusion : Object, }, - components: {LoaderModal, ImageItem}, + components: { LoaderModal, ImageItem, SDOptionsDropdown }, mounted() { }, @@ -230,6 +155,11 @@ export default { }; }, + computed: { + this_object(){ + return this; + } + }, methods: { generate_from_prompt(){ let seed = 0; diff --git a/electron_app/src/components_bare/SDOptionsDropdown.vue b/electron_app/src/components_bare/SDOptionsDropdown.vue new file mode 100644 index 00000000..2645ce89 --- /dev/null +++ b/electron_app/src/components_bare/SDOptionsDropdown.vue @@ -0,0 +1,121 @@ + + + + \ No newline at end of file
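Below is a minimal, illustrative sketch (not part of the diff) of how the pieces added above fit together: constructing the TensorFlow StableDiffusion wrapper with is_sd_15_inpaint=True makes get_models() size the UNet input as 4 + 4 + 1 channels (latent, masked-image encoding, downscaled mask), and load_weights_from_pytorch_ckpt() walks PYTORCH_CKPT_MAPPING to copy a PyTorch checkpoint into the Keras sub-models. The import path and the checkpoint filename are assumptions made for illustration; they are not defined by this patch.

# Illustrative sketch only -- not part of this diff.
# Assumes the package is importable as stable_diffusion_tf and that a local
# "sd-v1-5-inpainting.ckpt" file exists; both are placeholders.
# load_weights_from_pytorch_ckpt() imports torch internally, so PyTorch must
# be installed for this code path.
from stable_diffusion_tf.stable_diffusion import StableDiffusion

# Build the inpainting variant: get_models() gives the UNet a
# 4 (latent) + 4 (masked-image encoding) + 1 (downscaled mask) channel input.
model = StableDiffusion(
    img_height=512,
    img_width=512,
    jit_compile=False,
    download_weights=False,
    is_sd_15_inpaint=True,
)

# Alternative to the .h5 weights fetched by the downloader: copy weights
# directly from a PyTorch checkpoint using PYTORCH_CKPT_MAPPING.
model.load_weights_from_pytorch_ckpt("sd-v1-5-inpainting.ckpt")

The per-module table in constants.py pairs each PyTorch state-dict key with the transpose permutation needed for the corresponding Keras weight (None where no transpose is required), which is why set_weights() in load_weights_from_pytorch_ckpt can be driven purely from that mapping.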