Introduction

GGUF is a file format for storing models to be used for inference with GGML and GGML-based executors. It is a binary format designed so that models load and save quickly and are easy to read. Typically a model is developed in PyTorch or another framework and then converted to GGUF for use with GGML.


Preparation

This walkthrough uses GOT_OCR2.0, a generative multimodal text-recognition model, as its example. First, enumerate all of the GOT_OCR2.0 model's weight key-value pairs to obtain the naming of the network structure, as follows:

# GOTQwenForCausalLM is the model class shipped with the GOT-OCR2.0 repository;
# model_path points at the downloaded HF checkpoint directory
got_model = GOTQwenForCausalLM.from_pretrained(model_path)
tensors = []
for key, value in got_model.state_dict().items():
    tensors.append((key, value))
    print(f"{key}   ->   {list(value.shape)}")
print(f"Tensor count: {len(tensors)}")
model.embed_tokens.weight   ->   [151860, 1024]
model.layers.0.self_attn.q_proj.weight   ->   [1024, 1024]
model.layers.0.self_attn.q_proj.bias   ->   [1024]
model.layers.0.self_attn.k_proj.weight   ->   [1024, 1024]
model.layers.0.self_attn.k_proj.bias   ->   [1024]
model.layers.0.self_attn.v_proj.weight   ->   [1024, 1024]
model.layers.0.self_attn.v_proj.bias   ->   [1024]
model.layers.0.self_attn.o_proj.weight   ->   [1024, 1024]
model.layers.0.mlp.gate_proj.weight   ->   [2816, 1024]
model.layers.0.mlp.up_proj.weight   ->   [2816, 1024]
model.layers.0.mlp.down_proj.weight   ->   [1024, 2816]
model.layers.0.input_layernorm.weight   ->   [1024]
model.layers.0.post_attention_layernorm.weight   ->   [1024]
model.layers.1.self_attn.q_proj.weight   ->   [1024, 1024]
model.layers.1.self_attn.q_proj.bias   ->   [1024]
model.layers.1.self_attn.k_proj.weight   ->   [1024, 1024]
model.layers.1.self_attn.k_proj.bias   ->   [1024]
model.layers.1.self_attn.v_proj.weight   ->   [1024, 1024]
model.layers.1.self_attn.v_proj.bias   ->   [1024]
model.layers.1.self_attn.o_proj.weight   ->   [1024, 1024]
model.layers.1.mlp.gate_proj.weight   ->   [2816, 1024]
model.layers.1.mlp.up_proj.weight   ->   [2816, 1024]
model.layers.1.mlp.down_proj.weight   ->   [1024, 2816]
model.layers.1.input_layernorm.weight   ->   [1024]
model.layers.1.post_attention_layernorm.weight   ->   [1024]
......
model.layers.23.self_attn.q_proj.weight   ->   [1024, 1024]
model.layers.23.self_attn.q_proj.bias   ->   [1024]
model.layers.23.self_attn.k_proj.weight   ->   [1024, 1024]
model.layers.23.self_attn.k_proj.bias   ->   [1024]
model.layers.23.self_attn.v_proj.weight   ->   [1024, 1024]
model.layers.23.self_attn.v_proj.bias   ->   [1024]
model.layers.23.self_attn.o_proj.weight   ->   [1024, 1024]
model.layers.23.mlp.gate_proj.weight   ->   [2816, 1024]
model.layers.23.mlp.up_proj.weight   ->   [2816, 1024]
model.layers.23.mlp.down_proj.weight   ->   [1024, 2816]
model.layers.23.input_layernorm.weight   ->   [1024]
model.layers.23.post_attention_layernorm.weight   ->   [1024]
model.norm.weight   ->   [1024]
lm_head.weight   ->   [151860, 1024]

model.vision_tower_high.pos_embed   ->   [1, 64, 64, 768]
model.vision_tower_high.patch_embed.proj.weight   ->   [768, 3, 16, 16]
model.vision_tower_high.patch_embed.proj.bias   ->   [768]
model.vision_tower_high.blocks.0.norm1.weight   ->   [768]
model.vision_tower_high.blocks.0.norm1.bias   ->   [768]
model.vision_tower_high.blocks.0.attn.rel_pos_h   ->   [27, 64]
model.vision_tower_high.blocks.0.attn.rel_pos_w   ->   [27, 64]
model.vision_tower_high.blocks.0.attn.qkv.weight   ->   [2304, 768]
model.vision_tower_high.blocks.0.attn.qkv.bias   ->   [2304]
model.vision_tower_high.blocks.0.attn.proj.weight   ->   [768, 768]
model.vision_tower_high.blocks.0.attn.proj.bias   ->   [768]
model.vision_tower_high.blocks.0.norm2.weight   ->   [768]
model.vision_tower_high.blocks.0.norm2.bias   ->   [768]
model.vision_tower_high.blocks.0.mlp.lin1.weight   ->   [3072, 768]
model.vision_tower_high.blocks.0.mlp.lin1.bias   ->   [3072]
model.vision_tower_high.blocks.0.mlp.lin2.weight   ->   [768, 3072]
model.vision_tower_high.blocks.0.mlp.lin2.bias   ->   [768]
......
model.vision_tower_high.blocks.11.norm1.weight   ->   [768]
model.vision_tower_high.blocks.11.norm1.bias   ->   [768]
model.vision_tower_high.blocks.11.attn.rel_pos_h   ->   [127, 64]
model.vision_tower_high.blocks.11.attn.rel_pos_w   ->   [127, 64]
model.vision_tower_high.blocks.11.attn.qkv.weight   ->   [2304, 768]
model.vision_tower_high.blocks.11.attn.qkv.bias   ->   [2304]
model.vision_tower_high.blocks.11.attn.proj.weight   ->   [768, 768]
model.vision_tower_high.blocks.11.attn.proj.bias   ->   [768]
model.vision_tower_high.blocks.11.norm2.weight   ->   [768]
model.vision_tower_high.blocks.11.norm2.bias   ->   [768]
model.vision_tower_high.blocks.11.mlp.lin1.weight   ->   [3072, 768]
model.vision_tower_high.blocks.11.mlp.lin1.bias   ->   [3072]
model.vision_tower_high.blocks.11.mlp.lin2.weight   ->   [768, 3072]
model.vision_tower_high.blocks.11.mlp.lin2.bias   ->   [768]
model.vision_tower_high.neck.0.weight   ->   [256, 768, 1, 1]
model.vision_tower_high.neck.1.weight   ->   [256]
model.vision_tower_high.neck.1.bias   ->   [256]
model.vision_tower_high.neck.2.weight   ->   [256, 256, 3, 3]
model.vision_tower_high.neck.3.weight   ->   [256]
model.vision_tower_high.neck.3.bias   ->   [256]
model.vision_tower_high.net_2.weight   ->   [512, 256, 3, 3]
model.vision_tower_high.net_3.weight   ->   [1024, 512, 3, 3]
model.mm_projector_vary.weight   ->   [1024, 1024]
model.mm_projector_vary.bias   ->   [1024]

Tensor count: 472

Defining the model architecture

In the conversion script convert_hf_to_gguf.py, define a model class that inherits from the Model parent class.

@Model.register("GOTQwenForCausalLM")
class GOTOCR2Model(Model):
    model_arch = gguf.MODEL_ARCH.GOT_OCR2

    def set_vocab(self):
        try:
            self._set_vocab_sentencepiece()
        except FileNotFoundError:
            self._set_vocab_qwen()
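
Besides set_vocab, the converter typically also needs set_gguf_parameters to write the language-model hyperparameters into the GGUF header. A minimal sketch inside the same class, assuming the GOT config.json exposes the standard Qwen2 keys (the commented values follow from the tensor shapes listed above):

    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])           # 1024
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])          # 24
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])  # 2816
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])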

Defining the tensor layout

Register the new architecture and its tensors in gguf-py's constants.py:

class MODEL_ARCH(IntEnum):
    ...
    GOT_OCR2     = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    ...
    MODEL_ARCH.GOT_OCR2:       "got_ocr2",   # arch name written to the GGUF header
}


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD           = auto()
    OUTPUT               = auto()
    OUTPUT_NORM          = auto()
    ATTN_NORM            = auto()
    ATTN_Q               = auto()
    ATTN_K               = auto()
    ATTN_V               = auto()
    ATTN_OUT             = auto()
    FFN_NORM             = auto()
    FFN_GATE             = auto()
    FFN_DOWN             = auto()
    FFN_UP               = auto()
    VIS_ATTN_QKV         = auto()
    VIS_ATTN_PROJ        = auto()
    VIS_ATTN_REL_POS_H   = auto()
    VIS_ATTN_REL_POS_W   = auto()
    VIS_MLP_LIN1         = auto()
    VIS_MLP_LIN2         = auto()
    VIS_NORM1            = auto()
    VIS_NORM2            = auto()
    VIS_NECK             = auto()
    VIS_NET              = auto()
    VIS_PATCH_EMBD_PROJ  = auto()
    VIS_POS_EMBD         = auto()
    MM_PROJ              = auto()


MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.GOT_OCR2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.VIS_ATTN_QKV,
        MODEL_TENSOR.VIS_ATTN_PROJ,
        MODEL_TENSOR.VIS_ATTN_REL_POS_H,
        MODEL_TENSOR.VIS_ATTN_REL_POS_W,
        MODEL_TENSOR.VIS_MLP_LIN1,
        MODEL_TENSOR.VIS_MLP_LIN2,
        MODEL_TENSOR.VIS_NORM1,
        MODEL_TENSOR.VIS_NORM2,
        MODEL_TENSOR.VIS_NECK,
        MODEL_TENSOR.VIS_NET,
        MODEL_TENSOR.VIS_PATCH_EMBD_PROJ,
        MODEL_TENSOR.VIS_POS_EMBD,
        MODEL_TENSOR.MM_PROJ,
    ],
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD:                "token_embd",
    MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
    MODEL_TENSOR.OUTPUT:                    "output",
    MODEL_TENSOR.ATTN_NORM:                 "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_Q:                    "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K:                    "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V:                    "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT:                  "blk.{bid}.attn_output",
    MODEL_TENSOR.FFN_NORM:                  "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:                  "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN:                  "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:                    "blk.{bid}.ffn_up",
    MODEL_TENSOR.VIS_ATTN_QKV:              "vis.blk.{bid}.attn.qkv",   # standardized tensor name in GGUF
    MODEL_TENSOR.VIS_ATTN_PROJ:             "vis.blk.{bid}.attn.proj",
    MODEL_TENSOR.VIS_ATTN_REL_POS_H:        "vis.blk.{bid}.attn.rel_pos_h",
    MODEL_TENSOR.VIS_ATTN_REL_POS_W:        "vis.blk.{bid}.attn.rel_pos_w",
    MODEL_TENSOR.VIS_MLP_LIN1:              "vis.blk.{bid}.mlp.lin1",
    MODEL_TENSOR.VIS_MLP_LIN2:              "vis.blk.{bid}.mlp.lin2",
    MODEL_TENSOR.VIS_NORM1:                 "vis.blk.{bid}.norm1",
    MODEL_TENSOR.VIS_NORM2:                 "vis.blk.{bid}.norm2",
    MODEL_TENSOR.VIS_NECK:                  "vis.neck.{bid}",
    MODEL_TENSOR.VIS_NET:                   "vis.net_{bid}",
    MODEL_TENSOR.VIS_PATCH_EMBD_PROJ:       "vis_patch_embd.proj",
    MODEL_TENSOR.VIS_POS_EMBD:              "vis_pos_embd",
    MODEL_TENSOR.MM_PROJ:                   "mm_proj",
}
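
Each entry is a name template; for blocked tensors the {bid} placeholder is filled in with the block index when the tensor map is built. A quick illustration (VIS_ATTN_QKV exists only once the additions above are in your local gguf-py):

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

print(TENSOR_NAMES[MODEL_TENSOR.VIS_ATTN_QKV].format(bid=11))  # vis.blk.11.attn.qkv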

Tensor mapping

Map the original tensor names to their standardized equivalents in GGUF. As a general rule, before adding a new tensor name to GGUF, make sure an equivalent name does not already exist. Once the equivalent GGUF tensor name is settled, add it to the tensor_mapping.py file. If a tensor name belongs to a repeated layer/block, the {bid} keyword stands in for the block index.

from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES

class TensorNameMap:
    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Vision tower
        MODEL_TENSOR.MM_PROJ: (
            "model.mm_projector_vary",  # got
        ),

        MODEL_TENSOR.VIS_PATCH_EMBD_PROJ: (
            "model.vision_tower_high.patch_embed.proj",   # got
        ),
        MODEL_TENSOR.VIS_POS_EMBD: (
            "model.vision_tower_high.pos_embed",    # got
        ),

        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
            "model.embed_tokens",                        # llama-hf nemotron olmoe got
        ),

        # Output
        MODEL_TENSOR.OUTPUT: (
            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe got
        ),

        # Output norm
        MODEL_TENSOR.OUTPUT_NORM: (
            "model.norm",                              # llama-hf baichuan internlm2 olmoe got
        ),

    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Attention norm
        MODEL_TENSOR.ATTN_NORM: (
            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe got
        ),

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe got
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe got
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe got
        ),

        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe got
        ),

        # Feed-forward norm
        MODEL_TENSOR.FFN_NORM: (
            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron olmoe got
        ),

        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron got
        ),

        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact got
        ),

        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron got
        ),

        MODEL_TENSOR.VIS_ATTN_QKV: (
            "model.vision_tower_high.blocks.{bid}.attn.qkv",  # got
        ),

        MODEL_TENSOR.VIS_ATTN_PROJ: (
            "model.vision_tower_high.blocks.{bid}.attn.proj",  # got
        ),

        MODEL_TENSOR.VIS_ATTN_REL_POS_H: (
            "model.vision_tower_high.blocks.{bid}.attn.rel_pos_h",  # got
        ),

        MODEL_TENSOR.VIS_ATTN_REL_POS_W: (
            "model.vision_tower_high.blocks.{bid}.attn.rel_pos_w",  # got
        ),

        MODEL_TENSOR.VIS_MLP_LIN1: (
            "model.vision_tower_high.blocks.{bid}.mlp.lin1",  # got
        ),

        MODEL_TENSOR.VIS_MLP_LIN2: (
            "model.vision_tower_high.blocks.{bid}.mlp.lin2",  # got
        ),

        MODEL_TENSOR.VIS_NORM1: (
            "model.vision_tower_high.blocks.{bid}.norm1",  # got
        ),

        MODEL_TENSOR.VIS_NORM2: (
            "model.vision_tower_high.blocks.{bid}.norm2",  # got
        ),

        MODEL_TENSOR.VIS_NECK: (
            "model.vision_tower_high.neck.{bid}",  # got
        ),

        MODEL_TENSOR.VIS_NET: (
            "model.vision_tower_high.net_{bid}",  # got
        ),
    }
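
With these mappings registered, the converter can resolve any original tensor name to its standardized GGUF name. A quick check, again assuming the local gguf-py carries the additions above (n_blocks=24 covers the deepest component, the 24-layer LLM; the 12 vision blocks simply never match the higher indices):

import gguf
from gguf.constants import MODEL_ARCH

name_map = gguf.get_tensor_name_map(MODEL_ARCH.GOT_OCR2, 24)

print(name_map.get_name("model.layers.0.self_attn.q_proj.weight",
                        try_suffixes=(".weight", ".bias")))
# -> blk.0.attn_q.weight
print(name_map.get_name("model.vision_tower_high.blocks.11.attn.qkv.weight",
                        try_suffixes=(".weight", ".bias")))
# -> vis.blk.11.attn.qkv.weight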

Depending on the model's configuration, tokenizer, code, and tensor layout, you may need to override the following class methods:

  • Model.set_gguf_parameters
  • Model.set_vocab
  • Model.write_tensors

These are implemented in Section 3, "Defining the model architecture", above.
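
For the tensor pass, the base Model.write_tensors already walks the state dict and renames every tensor through the architecture's tensor map, so in practice it is often enough to override modify_tensors. A minimal sketch inside GOTOCR2Model (map_tensor_name is provided by the base class; any reshaping the vision tensors may need would go here):

    def modify_tensors(self, data_torch, name, bid):
        # Rename the original tensor to its GGUF name via the GOT_OCR2
        # entries added to tensor_mapping.py above
        return [(self.map_tensor_name(name, try_suffixes=(".weight", ".bias")), data_torch)]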

Note: tensor names must end with the .weight suffix; tools such as quantize expect this convention when processing the weights.

Converting the model weights

python convert_hf_to_gguf.py --outtype bf16 --model ~/GOT-OCR2_0 --outfile ~/output/GOT-OCR2_0-GGUF
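
To sanity-check the result, the file can be read back with gguf-py's GGUFReader. A minimal sketch (the path below assumes the --outfile value from the command above):

from gguf import GGUFReader

reader = GGUFReader("GOT-OCR2_0-GGUF")
print(f"Tensor count: {len(reader.tensors)}")  # should match the 472 tensors enumerated earlier
for t in reader.tensors[:3]:
    print(t.name, t.tensor_type.name, list(t.shape))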

Implementation

https://github.com/jerrylsu/gguf-py

Acknowledgements 🙏

ggml: Tensor library for machine learning.

llama.cpp: LLM inference in C/C++.

GOT-OCR2.0: Official code implementation of General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model.
