@article{188676,
author = {Priyabrata Das Adhikari and Sakshi Kathuria},
title = {Visual Language Models (VLMs) with Contextual Analysis of SimVLM},
journal = {International Journal of Innovative Research in Technology},
year = {2025},
volume = {12},
number = {7},
pages = {2860-2868},
issn = {2349-6002},
url = {https://ijirt.org/article?manuscript=188676},
abstract = {Visual Language Models (VLMs) have become central to multimodal artificial intelligence by enabling unified understanding of images and text. They power modern applications such as visual question answering, caption generation, image-text retrieval, and multimodal reasoning. A pivotal development in this evolution was SimVLM, which introduced a minimalist pretraining framework based on weakly aligned image-text pairs and a unified PrefixLM objective. This simplified approach demonstrated that large-scale web data and end-to-end training can outperform complex multi-objective pipelines. In this review, we position SimVLM within the broader development of VLMs, surveying architectural designs, training methodologies, downstream applications, strengths, limitations, and emerging trends. We examine key challenges including robustness, reasoning, bias, scalability, and evaluation gaps, and identify promising future directions such as efficient VLMs, multimodal foundation models, domain-specialized architectures, and safety-focused multimodal AI. This review offers a consolidated perspective on the field while highlighting SimVLM's lasting contributions to the evolution of vision-language intelligence.},
keywords = {Visual Language Models; SimVLM; Multimodal AI; Visual Language Pretraining; Transformer Models; Weak Supervision; PrefixLM; Multimodal Reasoning; Adapter Tuning; Efficient VLMs},
month = {December},
}
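The abstract's central technical idea is SimVLM's PrefixLM objective: the image patches (and, in the paper, a randomly chosen leading span of text) form a bidirectionally attended prefix, while the remaining text tokens are predicted autoregressively with a single cross-entropy loss. Below is a minimal sketch of that masking pattern in PyTorch; the module and tensor names (PrefixLMToy, patch_emb, text_ids) and all dimensions are illustrative assumptions, not the authors' implementation, and the prefix here is simplified to the image patches only.

# Minimal PrefixLM sketch (illustrative; not SimVLM's actual code):
# the image-patch prefix attends bidirectionally, text decodes causally.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PrefixLMToy(nn.Module):
    def __init__(self, vocab_size=32000, d_model=512, n_layers=4, n_heads=8):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.backbone = nn.TransformerEncoder(layer, n_layers)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, patch_emb, text_ids):
        # patch_emb: (B, P, d_model) image-patch embeddings (the prefix)
        # text_ids:  (B, T) caption token ids (the suffix to predict)
        P, T = patch_emb.shape[1], text_ids.shape[1]
        x = torch.cat([patch_emb, self.token_emb(text_ids)], dim=1)
        L = P + T
        mask = torch.zeros(L, L)                 # 0 = attend, -inf = blocked
        causal = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
        mask[P:, P:] = mask[P:, P:].masked_fill(causal, float("-inf"))
        mask[:P, P:] = float("-inf")             # prefix never attends to text
        h = self.backbone(x, mask=mask)
        logits = self.lm_head(h[:, P:-1])        # predict each next text token
        return F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                               text_ids[:, 1:].reshape(-1))

# Example usage with random inputs:
# loss = PrefixLMToy()(torch.randn(2, 16, 512), torch.randint(0, 32000, (2, 8)))

The single attention mask is what makes the framework "minimalist" in the sense the abstract describes: one objective over weakly aligned image-text pairs, with no separate contrastive, matching, or masked-region losses.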