# PaliGemma

Implementation of the PaliGemma vision-language model (VLM).
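At a high level, PaliGemma pairs a SigLIP vision encoder with a Gemma decoder: image patches are encoded, projected into the language model's embedding space, and prepended to the text tokens before causal decoding. Below is a minimal PyTorch sketch of that composition; the module names are illustrative placeholders, not this repository's actual classes.

```python
import torch
import torch.nn as nn

class PaliGemmaSketch(nn.Module):
    """Illustrative composition of the three PaliGemma components."""

    def __init__(self, vision_tower: nn.Module, projector: nn.Module,
                 decoder: nn.Module, embed_tokens: nn.Embedding):
        super().__init__()
        self.vision_tower = vision_tower  # SigLIP-style ViT: pixels -> patch features
        self.projector = projector        # linear map: vision dim -> LM embedding dim
        self.decoder = decoder            # Gemma-style decoder over input embeddings
        self.embed_tokens = embed_tokens  # LM token embedding table

    def forward(self, pixel_values: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
        image_embeds = self.projector(self.vision_tower(pixel_values))  # (B, P, D)
        text_embeds = self.embed_tokens(input_ids)                      # (B, T, D)
        # Image tokens come first; the decoder attends over the joint sequence.
        inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)
        return self.decoder(inputs_embeds)                              # (B, P + T, vocab)
```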
## Installation

Use conda or uv to install the dependencies:

```bash
uv sync --extra {cpu,gpu}
```

## Model weights

The weights are hosted on Hugging Face. Download them with:
```bash
git lfs install
git clone https://huggingface.co/google/paligemma-3b-pt-224 ~/workspace/model_weights/paligemma-3b-pt-224
```

Set `MODEL_PATH` in `launch_inference.sh` to the downloaded directory.
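To sanity-check the download, the weights can also be loaded with the official Hugging Face `transformers` API (assuming a recent `transformers` with PaliGemma support is installed; this uses the upstream classes, not this repository's implementation, and `example.jpg` is a placeholder):

```python
import os
import torch
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

MODEL_PATH = os.path.expanduser("~/workspace/model_weights/paligemma-3b-pt-224")

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = PaliGemmaForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16
)

image = Image.open("example.jpg")  # any local test image
inputs = processor(text="caption en", images=image, return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20)
print(processor.decode(out[0], skip_special_tokens=True))
```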
## References

The implementation builds on the following papers.

```bibtex
@misc{beyer2024paligemmaversatile3bvlm,
title={PaliGemma: A versatile 3B VLM for transfer},
author={Lucas Beyer and Andreas Steiner and André Susano Pinto and Alexander Kolesnikov and Xiao Wang and Daniel Salz and Maxim Neumann and Ibrahim Alabdulmohsin and Michael Tschannen and Emanuele Bugliarello and Thomas Unterthiner and Daniel Keysers and Skanda Koppula and Fangyu Liu and Adam Grycner and Alexey Gritsenko and Neil Houlsby and Manoj Kumar and Keran Rong and Julian Eisenschlos and Rishabh Kabra and Matthias Bauer and Matko Bošnjak and Xi Chen and Matthias Minderer and Paul Voigtlaender and Ioana Bica and Ivana Balazevic and Joan Puigcerver and Pinelopi Papalampidi and Olivier Henaff and Xi Xiong and Radu Soricut and Jeremiah Harmsen and Xiaohua Zhai},
year={2024},
eprint={2407.07726},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2407.07726},
}
```

```bibtex
@misc{zhai2023sigmoidlosslanguageimage,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2303.15343},
}
```
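SigLIP replaces the softmax-based contrastive (CLIP-style) objective with a pairwise sigmoid loss, so each image-text pair is an independent binary classification and no batch-wide normalization is needed. A minimal PyTorch sketch of that loss, per the paper (illustrative, not this repository's code):

```python
import torch
import torch.nn.functional as F

def siglip_loss(img_emb: torch.Tensor, txt_emb: torch.Tensor,
                log_t: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    """Pairwise sigmoid loss; img_emb/txt_emb are L2-normalized (N, D),
    log_t and bias are learned scalars (temperature and bias)."""
    n = img_emb.size(0)
    logits = img_emb @ txt_emb.t() * log_t.exp() + bias        # (N, N) pair logits
    labels = 2.0 * torch.eye(n, device=logits.device) - 1.0    # +1 on the diagonal, -1 off
    # Each pair is scored independently; no softmax across the batch.
    return -F.logsigmoid(labels * logits).sum() / n
```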
```bibtex
@misc{vaswani2023attentionneed,
title={Attention Is All You Need},
author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
year={2023},
eprint={1706.03762},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1706.03762},
}
```
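The scaled dot-product attention from this paper is the core primitive of both the vision encoder and the decoder. A minimal sketch (PyTorch also ships this built in as `torch.nn.functional.scaled_dot_product_attention`):

```python
import math
import torch

def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
              mask: torch.Tensor | None = None) -> torch.Tensor:
    """q, k, v: (batch, heads, seq, head_dim); mask: additive, broadcastable."""
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))  # (B, H, Tq, Tk)
    if mask is not None:
        scores = scores + mask  # e.g. -inf above the diagonal for causal decoding
    return torch.softmax(scores, dim=-1) @ v                  # (B, H, Tq, head_dim)
```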
```bibtex
@misc{dosovitskiy2021imageworth16x16words,
title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
author={Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
year={2021},
eprint={2010.11929},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2010.11929},
}
```

```bibtex
@misc{zhang2019rootmeansquarelayer,
title={Root Mean Square Layer Normalization},
author={Biao Zhang and Rico Sennrich},
year={2019},
eprint={1910.07467},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1910.07467},
}
```
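Gemma normalizes with RMSNorm, which rescales by the root mean square of the activations and, unlike LayerNorm, subtracts no mean and adds no bias. A sketch of the paper's original form (note that Gemma's variant scales by `1 + weight` instead; illustrative, not this repository's code):

```python
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))  # learned per-channel gain

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Rescale by the root mean square over the feature dimension.
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight
```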
```bibtex
@misc{ba2016layernormalization,
title={Layer Normalization},
author={Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E. Hinton},
year={2016},
eprint={1607.06450},
archivePrefix={arXiv},
primaryClass={stat.ML},
url={https://arxiv.org/abs/1607.06450},
}
```

```bibtex
@misc{shazeer2019fasttransformerdecodingwritehead,
title={Fast Transformer Decoding: One Write-Head is All You Need},
author={Noam Shazeer},
year={2019},
eprint={1911.02150},
archivePrefix={arXiv},
primaryClass={cs.NE},
url={https://arxiv.org/abs/1911.02150},
}
```

```bibtex
@misc{ainslie2023gqatraininggeneralizedmultiquery,
title={GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
author={Joshua Ainslie and James Lee-Thorp and Michiel de Jong and Yury Zemlyanskiy and Federico Lebrón and Sumit Sanghai},
year={2023},
eprint={2305.13245},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2305.13245},
}
```
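Grouped-query attention shares each key/value head across a group of query heads, shrinking the KV cache; multi-query attention (Shazeer, cited above) is the single-KV-head special case. A common way to implement it is to expand the KV heads before the attention product, sketched here in PyTorch (illustrative, not this repository's code):

```python
import torch

def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Expand (B, n_kv_heads, T, D) keys/values to n_kv_heads * n_rep heads.

    n_rep = n_query_heads // n_kv_heads; n_rep == 1 recovers standard
    multi-head attention, n_kv_heads == 1 recovers multi-query attention.
    """
    b, n_kv, t, d = kv.shape
    if n_rep == 1:
        return kv
    kv = kv[:, :, None, :, :].expand(b, n_kv, n_rep, t, d)
    return kv.reshape(b, n_kv * n_rep, t, d)
```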
```bibtex
@misc{su2023roformerenhancedtransformerrotary,
title={RoFormer: Enhanced Transformer with Rotary Position Embedding},
author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
year={2023},
eprint={2104.09864},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2104.09864},
}
```
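Gemma encodes positions with RoPE, which rotates query/key feature pairs by a position-dependent angle so attention scores depend only on relative offsets. A sketch using the split-half pairing convention common in Llama/Gemma-style implementations (an interleaved-pair variant also exists; illustrative, not this repository's code):

```python
import torch

def apply_rope(x: torch.Tensor, positions: torch.Tensor,
               base: float = 10000.0) -> torch.Tensor:
    """x: (batch, heads, seq, head_dim) with even head_dim; positions: (seq,)."""
    half = x.size(-1) // 2
    freqs = base ** (-torch.arange(half, dtype=torch.float32) / half)  # theta_i
    angles = positions[:, None].float() * freqs[None, :]               # (seq, half)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., :half], x[..., half:]
    # Pairwise 2D rotation: (x1, x2) -> (x1*cos - x2*sin, x1*sin + x2*cos)
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
```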