from transformers import AutoTokenizer
from transformers import BlipForConditionalGeneration
from transformers import BlipProcessor, BlipTextConfig
from transformers.models.blip.modeling_blip_text import BlipTextLMHeadModel
# Load the pretrained BLIP image-captioning model.
# NOTE(review): the string below has more path segments than a Hub repo id
# normally has, so it is presumably a local directory holding a downloaded
# copy of Salesforce/blip-image-captioning-base -- confirm the path exists
# relative to the working directory.
model = BlipForConditionalGeneration.from_pretrained(
    "huggingface.co/Salesforce/blip-image-captioning-base"
)
# During actual training, the BERT tokenizer can be used to handle both encoding and decoding.
# Bare expression: evaluates the name `bertTokenizer` and discards the result
# (an interactive session would echo its repr).
# NOTE(review): `bertTokenizer` is not defined in the visible part of this
# file -- confirm it is created elsewhere, otherwise this line raises NameError.
bertTokenizer