CLIP: Contrastive Language-Image Pretraining

1. Data

1.1 image_processor

# 1. The first processor is usually the CLIP pretrained model (ViT), 224*224.
# Note: this image_processor is not used for the OPT branch.
from transformers.models.clip.image_processing_clip import CLIPImageProcessor

image_processor = CLIPImageProcessor.from_pretrained("~/.cache/huggingface/hub/models-openai-clip-vit-large-patch14")
processor = image_processor
image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

# 2. The second processor, processor_high, is usually the custom image encoder (SAM/Swin/CNN), 1024*1024.
from vary.model.plug.transforms import train_transform

image_processor_high = train_transform
processor_high = image_processor_high
image_high = processor_high(image_high)

1.2 multimodal_processor

<image> -> <img> + [<imgpad>] * 256 + </img>
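
A minimal sketch of this substitution (the constant names below are assumptions that mirror the tokens shown above; the actual Vary multimodal processor may differ in detail):

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"
DEFAULT_IM_START_TOKEN = "<img>"
DEFAULT_IM_END_TOKEN = "</img>"
NUM_PATCHES = 256  # one <imgpad> slot per image patch fed to the LLM

def expand_image_placeholder(text: str) -> str:
    # replace the single <image> placeholder with <img> + 256 * <imgpad> + </img>
    image_span = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * NUM_PATCHES + DEFAULT_IM_END_TOKEN
    return text.replace(DEFAULT_IMAGE_TOKEN, image_span)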

1.3 token_processor

    """
    Given a list of sources, each is a conversation list. This transform:
    1. Add signal '### ' at the beginning each sentence, with end signal '\n';
    2. Concatenate conversations together;
    3. Tokenize the concatenated conversation;
    4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
    """
    DEFAULT_IM_START_TOKEN = <img>
    DEFAULT_IM_END_TOKEN = </img>

    conversation = [
        {
            'from': 'human', 
            'value': '<img>' + '<imgpad>' * 256 + '</img>'  # 256 <imgpad> placeholders, written out literally in the raw sample
        }, 
        {
            'from': 'gpt', 
            'value': 'The man is sitting on top of piled objects or belongings loaded into the back of a pickup truck.'
        },
        {
            'from': 'human',
            'value': 'Is the man holding anything in his hands?'
        },
        {
            'from': 'gpt', 
            'value': 'Yes, the man is holding a beer in his hand while sitting on top of the objects in the back of the pickup truck.'
        },
        {
            'from': 'human',
            'value': 'What color is the pickup truck?'
        },
        {
            'from': 'gpt', 
            'value': 'The pickup truck is white.'
        }, 
        {
            'from': 'human', 
            'value': 'Is the man sitting or standing?'
        }, 
        {
            'from': 'gpt', 
            'value': 'The man is sitting on top of the piled objects in the back of the pickup truck.'
        }, 
        {
            'from': 'human', 
            'value': 'What could be the possible reasons for the man sitting on top of the possessions in the back of the pickup truck?'
        }, 
        {
            'from': 'gpt', 
            'value': "There could be several reasons for the man sitting on top of his possessions in the back of the pickup truck:\n\n1. Moving: The man might be moving to a new location and needed to transport his items in a pickup truck, utilizing available space efficiently. By sitting on top of the belongings, he could be helping to stabilize and secure the items during the move, preventing them from falling or shifting during transportation.\n\n2. Lack of seating: If the cab of the pickup truck is already at full capacity or there isn't enough space for him to sit inside, the man may have chosen to sit on his possessions as an alternative seating arrangement.\n\n3. Road trip or outing: The man might be on a road trip or a casual outing with friends or family, where he is using the back of the pickup truck as an open-air seating area. By sitting on top of the loaded items, he may be enjoying the journey while savoring his beer.\n\n4. Keeping an eye on belongings: The man could be safeguarding his possessions by staying close to them, ensuring that no items are lost, stolen or damaged during the journey.\n\nRegardless of the specific reason, the image shows a person making the most of their situation, adding a touch of lightheartedness or adventure to an otherwise mundane scene."
        }
    ]
text = "\n".join(turn["value"] for turn in conversation) + "</s>"    # pseudo code

# `strings` is the list of concatenated conversation texts built as above
tokenized_list = [
    self.tokenizer(
        text,
        return_tensors="pt",
        padding="longest",
        max_length=self.tokenizer.model_max_length,
        truncation=True,
    ) for text in strings
]

# input_ids = labels 
input_ids = labels = [
    tokenized.input_ids[0] for tokenized in tokenized_list
]

# mask the positions that should not contribute to the loss with IGNORE_INDEX
IGNORE_INDEX = -100
labels = labels.replace("<imgpad>", IGNORE_INDEX).replace("human words", IGNORE_INDEX)  # pseudo code
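
A runnable sketch of this masking step, assuming a 1-D input_ids tensor for one conversation and hypothetical turn_sources/turn_lengths lists that record, for each turn, who spoke and how many tokens it occupies:

import copy
import torch

IGNORE_INDEX = -100

labels = copy.deepcopy(input_ids)

# 1) never compute loss on the <imgpad> placeholder positions
imgpad_id = tokenizer.convert_tokens_to_ids("<imgpad>")
labels[labels == imgpad_id] = IGNORE_INDEX

# 2) mask the human turns so that only the gpt answers are supervised
cur = 0
for source, length in zip(turn_sources, turn_lengths):   # hypothetical bookkeeping lists
    if source == "human":
        labels[cur:cur + length] = IGNORE_INDEX
    cur += length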

2. Model

varyOPTModel -> OPTModel -> OPTDecoder -> OPTDecoderLayer

2.1 varyOPTModel

class varyOPTModel(OPTModel):
    def __init__(self, config: OPTConfig):
        super(varyOPTModel, self).__init__(config)

        # SAM ViT-B image encoder as the vision tower
        self.vision_tower = build_sam_vit_b()

        # project 1024-dim vision features into the 768-dim OPT embedding space
        self.mm_projector = nn.Linear(1024, 768)
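
A minimal sketch of how these two modules could be used in the forward pass; the real Vary forward is more involved (it reshapes the SAM feature map and handles batching), so treat the shapes below as assumptions:

    def encode_image(self, image_high):
        # SAM ViT-B encodes the 1024*1024 image; assume the output has been
        # flattened into a (num_patches, 1024) sequence of patch features
        with torch.no_grad():
            patch_features = self.vision_tower(image_high)
        # project into the 768-dim OPT word-embedding space
        image_features = self.mm_projector(patch_features)
        return image_features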

2.2 OPTModel

class OPTModel(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.decoder = OPTDecoder(config)

2.3 OPTDecoder

class OPTDecoder(OPTPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
    """
    self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
    self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
    self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
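
These three modules combine roughly as follows (a simplified view; the real OPTDecoder.forward also prepares the causal attention mask, handles past key values, and applies project_in/project_out when word_embed_proj_dim != hidden_size; the embed_positions API detail may vary by transformers version):

    inputs_embeds = self.embed_tokens(input_ids)          # (B, T, word_embed_proj_dim)
    pos_embeds = self.embed_positions(attention_mask)     # learned positions derived from the attention mask
    hidden_states = inputs_embeds + pos_embeds
    for decoder_layer in self.layers:
        hidden_states = decoder_layer(hidden_states)[0]   # mask preparation omitted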

2.4 OPTDecoderLayer

class OPTDecoderLayer(nn.Module):
    self.self_attn = OPTAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            bias=config.enable_bias,
    )
    self.activation_fn = ACT2FN[config.activation_function]

    self.self_attn_layer_norm = nn.LayerNorm(
        self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
    )
    self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
    self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
    self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
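
The data flow through one layer, in the default pre-LN configuration, looks roughly like this (dropout, head masks, and the do_layer_norm_before=False variant are omitted):

    def forward_sketch(self, hidden_states, attention_mask=None):
        # self-attention block
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, _, _ = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
        hidden_states = residual + hidden_states

        # feed-forward block
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc2(self.activation_fn(self.fc1(hidden_states)))
        hidden_states = residual + hidden_states
        return hidden_states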

# The vision tower is built as follows:
self.vision_tower = build_sam_vit_b()

def build_sam_vit_b(checkpoint=None):
    return _build_sam(
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
        checkpoint=checkpoint,
    )


def _build_sam(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
    checkpoint=None,
):
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16

    # This class of ImageEncoderViT and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa

    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )

    return image_encoder
# splice the projected image embeddings into the text embeddings, overwriting the <imgpad> slots
input_embeds = torch.cat(
    (
        input_embeds[:image_start_token_pos + 1],
        image_features,                                       # num_patches = 256 projected patch features
        input_embeds[image_start_token_pos + num_patches + 1:]
    ),
    dim=0
)
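
A minimal sketch of how the splice position could be located, assuming <img> was added to the tokenizer as a special token (the variable names are assumptions, not the exact Vary code):

im_start_token_id = tokenizer.convert_tokens_to_ids("<img>")
image_start_token_pos = int((input_ids == im_start_token_id).nonzero()[0])
num_patches = 256   # matches the 256 <imgpad> placeholders inserted by the multimodal processor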

Vision Vocabulary: CLIP's vision vocabulary refers to the collection of image representations and semantic information that the model learns from large-scale image-text data through contrastive learning.
During training, CLIP learns to embed images into a high-dimensional space and, within that space, to classify or retrieve images via text descriptions. The structure of this vision vocabulary is defined by the model's architecture and training data.
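
A minimal zero-shot sketch of this contrastive idea, using the Hugging Face CLIP API (the checkpoint name matches the one loaded in section 1.1; the image path and text prompts are placeholders):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

image = Image.open("truck.jpg")   # placeholder image path
texts = ["a man sitting in a pickup truck", "a cat on a sofa"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)   # image-text similarity logits -> probabilities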
