
Looking inside the Pipeline of Hugging Face

How does the following pipeline actually work under the hood?

from diffusers import StableDiffusionPipeline
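
As a reference point, this is roughly the one-liner we will be taking apart (a sketch; the exact model id and call arguments are assumptions, chosen to match the components downloaded below):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",  # assumed checkpoint, same repo as the unet below
    torch_dtype=torch.float16
).to("cuda")
image = pipe("a photograph of an astronaut riding a horse").images[0]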

To get started at a high level, let's download the key components needed for assembling everything:

  • CLIPTextModel: turns captions into embedding vectors
  • CLIPTokenizer: splits captions into the token IDs the text model expects
  • AutoencoderKL: the VAE that squishes our image down to a manageable latent size
  • UNet2DConditionModel: the U-Net that predicts the noise at each denoising step

import torch
import matplotlib.pyplot as plt
from PIL import Image
from typing import List

from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import LMSDiscreteScheduler

1. Get the different components

Following the usual Hugging Face pattern, we download the different parts:

vae = AutoencoderKL.from_pretrained(
    "stabilityai/sd-vae-ft-ema",
    torch_dtype=torch.float16
).to("cuda")

unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="unet",
    torch_dtype=torch.float16
).to("cuda")

Next we need a scheduler that controls the level of noise intensity at each step. It uses the following relationship:

The scheduler below is from Katherine Crowson (https://github.com/crowsonkb), an AI/generative artist.

beta_start, beta_end = 0.00085, 0.012
plt.plot(torch.linspace(beta_start**0.5, beta_end**0.5, 1000) ** 2)
plt.xlabel('Timestep')
plt.ylabel('β');

[figure: β rising over 1000 timesteps under the scaled-linear schedule]

scheduler = LMSDiscreteScheduler(
    beta_start=beta_start,
    beta_end=beta_end,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000
)
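
As a quick sanity check (a sketch, not part of the pipeline), the scheduler's internal betas should match the curve plotted above, since "scaled_linear" squares a linspace between the square roots of beta_start and beta_end:

betas = torch.linspace(beta_start**0.5, beta_end**0.5, 1000) ** 2
torch.allclose(scheduler.betas, betas)  # expected: True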

2. Let's generate the astronaut on a horse again

Here are the settings we will use:

prompt = ["a photograph of an astronaut riding a horse"]

height = 512
width = 512
num_inference_steps = 70
guidance_scale = 7.5
batch_size = 1
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt"
)

Looking at the tokenized text: 49406 marks the start of the caption, and 49407 is the end-of-text token, repeated as padding to ensure all inputs are the same length.

text_input['input_ids'] # torch.Size([1, 77])

# "a photograph of an astronaut riding a horse"
>>> tensor([[49406,   320,  8853,   539,   550, 18376,  6765,   320,  4558, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
             49407, 49407, 49407, 49407, 49407, 49407, 49407]])

tokenizer.decode(49407)
>>> "<|endoftext|>"

The text_input also carries an attention mask that lets the model ignore these "EOT" padding tokens:

text_input["attention_mask"]  # torch.Size([1, 77])

# "a photograph of an astronaut riding a horse"
>>> tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]])

Now, once the text is run through the encoder, each token is mapped to its embedding:

text_embeddings = text_encoder(text_input.input_ids.to("cuda"))[0].half()
text_embeddings.shape

# >>> torch.Size([1, 77, 768])

As discussed before, classifier-free guidance also needs an un-guided prediction, so we still have to pass an empty prompt through the encoder.

Note that .half() converts to 16-bit floating point (fp16 / float16), which trades decimal precision for memory.
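
A quick illustration of that trade-off (a standalone sketch, not part of the pipeline):

x = torch.tensor(3.141592653589793)
x.element_size(), x.half().element_size()  # (4, 2) - half the bytes per element
x.half()  # tensor(3.1406, dtype=torch.float16) - some precision lost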

# ensure the input is the same length
max_length = text_input.input_ids.shape[-1]

# the token will be the empty string
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
)

# same shape as before, but these are the embeddings of the empty prompt
uncond_embeddings = text_encoder(uncond_input.input_ids.to("cuda"))[0].half()
uncond_embeddings.shape

# >>> torch.Size([1, 77, 768]) 

# combine both inputs together for processing purposes
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

Next, standard Gaussian noise is generated. Note that the noise lives at the compressed latent size, not the original image size (a quick sketch verifying this follows the list):

  • original size: 512px x 512px x 3
  • compressed size: 64px x 64px x 4
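
To see that 8x spatial compression directly, a (purely hypothetical) random image-sized tensor can be pushed through the VAE encoder; this check isn't needed for generation:

# hypothetical input just to inspect shapes: one 512x512 RGB "image" in [-1, 1]
fake_img = torch.randn(1, 3, 512, 512, device="cuda", dtype=torch.float16)
with torch.no_grad():
    fake_latent = vae.encode(fake_img).latent_dist.sample()
fake_latent.shape  # torch.Size([1, 4, 64, 64])
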
# let's create some noise
torch.manual_seed(100)
latents = torch.randn((batch_size, unet.in_channels, height // 8, width // 8))
latents = latents.to("cuda").half()
latents.shape

"""
>>> torch.Size([1, 4, 64, 64])
tensor([[[[ 0.1268,  1.3564,  0.5630,  ..., -0.8042, -0.6245, -0.5884],
          [ 1.6699, -0.9272, -0.9761,  ...,  2.3848, -0.0355, -0.3179],
          [ 0.3579, -1.7842, -0.3052,  ...,  0.4880, -2.5781,  1.0010],
          ...
"""

# set the scheduler - currently 70 steps
scheduler.set_timesteps(num_inference_steps)

# scale by the initial noise level, e.g. latents = latents * tensor(14.6146) here
latents = latents * scheduler.init_noise_sigma

"""
Normally this would be a full 1000 steps, but our method skips through them to approximate
the process. A "timestep" here is not really time; it just indexes how much noise is present.

scheduler.timesteps

>>> tensor([999.0000, 984.5217, 970.0435, 955.5652, 941.0870, 926.6087, 912.1304,
            897.6522, 883.1739, 868.6957, 854.2174, 839.7391, 825.2609, 810.7826,
            796.3043, 781.8261, 767.3478, 752.8696, 738.3913, 723.9130, 709.4348,
            ... for 70 steps ... 0.0000]

scheduler.sigmas
>>> tensor([14.6146, 13.3974, 12.3033, 11.3184, 10.4301,  9.6279,  8.9020,  8.2443,
            7.6472,  7.1044,  6.6102,  6.1594,  5.7477,  5.3709,  5.0258,  4.7090,
            4.4178,  4.1497,  3.9026,  3.6744,  3.4634,  3.2680,  3.0867,  2.9183,
            ... for 70 steps .... 0.000]

Reminder that sigma is the amount of noise (loosely, the 'variance'); it starts large
and decreases toward zero over the steps.
"""
from tqdm.auto import tqdm

# loop through decreasing amounts of noise
for i, t in enumerate(tqdm(scheduler.timesteps)):
    input = torch.cat([latents] * 2)
    input = scheduler.scale_model_input(input, t)

    # predict the noise residual
    with torch.no_grad():
        # remember text_embeddings is [empty embeddings, prompt embeddings]
        # - passes the current noisy latents (input),
        # - the noise level t, and our text embeddings
        pred = unet(input, t, encoder_hidden_states=text_embeddings).sample
        # pred is torch.Size([2, 4, 64, 64]) - one prediction per embedding

    # perform guidance
    pred_uncond, pred_text = pred.chunk(2)

    # classifier-free guidance: push the unconditional prediction toward the text-conditioned one
    pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)

    # compute the "previous" noisy sample
    # the latents are updated and then will be fed in again
    latents = scheduler.step(pred, t, latents).prev_sample

"""
Once this is complete, decode the latents with the VAE to get back a full-size
generated photo, then feed it through PIL to take a look
"""
with torch.no_grad():
    image = vae.decode(1 / 0.18215 * latents).sample
    # 0.18215 is the latent scaling factor from the original paper
    # image.shape -> torch.Size([1, 3, 512, 512])

image = (image / 2 + 0.5).clamp(0, 1)
image = image[0].detach().cpu().permute(1, 2, 0).numpy()
image = (image * 255).round().astype("uint8")
Image.fromarray(image)

3. Refactoring for programming agility

The following code does the same thing as above, but is refactored into reusable functions:

def text_enc(prompts: List[str], maxlen: int = None) -> torch.Tensor:
    """Encodes a list of prompts
    Returns:
        torch.Tensor of shape [batch, 77, 768], dtype float16
    """
    if maxlen is None:
        maxlen = tokenizer.model_max_length
    inp = tokenizer(
        prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt"
    )
    return text_encoder(inp.input_ids.to("cuda"))[0].half()


def mk_img(t):
    """transforms raw decompressed image into PIL-compatible ranges + sizes"""
    image = (t / 2 + 0.5).clamp(0, 1).detach().cpu().permute(1, 2, 0).numpy()
    return Image.fromarray((image*255).round().astype("uint8"))


def mk_samples(prompts: List[str], g=7.5, seed=100, steps=70):
    """infers based on Unet, and returns a VAE decoded image of torch.Size([1, 3, 512, 512])"""
    bs = len(prompts)
    text = text_enc(prompts)
    uncond = text_enc([""] * bs, text.shape[1])
    emb = torch.cat([uncond, text])
    if seed: torch.manual_seed(seed)

    latents = torch.randn((bs, unet.in_channels, height//8, width//8))
    scheduler.set_timesteps(steps)
    latents = latents.to("cuda").half() * scheduler.init_noise_sigma

    for i,ts in enumerate(tqdm(scheduler.timesteps)):
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
        with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
        pred = u + g*(t-u)
        latents = scheduler.step(pred, ts, latents).prev_sample

    with torch.no_grad():
        return vae.decode(1 / 0.18215 * latents).sample

And now we can run some samples in only a few lines:

prompts = [
    "a photograph of an astronaut riding a horse",
    "an oil painting of an astronaut riding a horse in the style of grant wood"
]

# will run the exact process above
images = mk_samples(prompts)
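
To look at the results, each decoded tensor can be passed through mk_img (a small usage sketch):

# each element of `images` is a [3, 512, 512] tensor in roughly [-1, 1]
imgs = [mk_img(t) for t in images]
imgs[0]  # display the first result in a notebook cell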

Homework

  • try condensing Image2Image and negative prompts in the same way, to really understand the code