Copyright © 2025. The authors retain the copyright of this article. This article is an open-access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@article{169564,
author = {Swarda Jangam and Samruddhi Patil and Tejaswini Mali},
title = {Image Caption Generator with Multilingual Captioning},
journal = {International Journal of Innovative Research in Technology},
year = {2024},
volume = {11},
number = {6},
pages = {3310-3313},
issn = {2349-6002},
url = {https://ijirt.org/article?manuscript=169564},
abstract = {This paper presents an AI-based image caption generator designed to automatically describe the contents of an image in multiple languages. As a new feature, the system accepts more than one image, correlates them, and generates a paragraph-style caption in multiple languages. The model utilizes advanced deep learning techniques, including convolutional neural networks (CNNs) for image processing and recurrent neural networks (RNNs) such as Long Short-Term Memory (LSTM) units for sequence generation. The multilingual capability is enabled through pre-trained language models that translate the generated captions into multiple languages. The system demonstrates high accuracy in capturing image content and fluency in generating captions across different languages. Potential applications include content accessibility, automatic translation services, and cross-cultural communication. The CNN extracts visual features from the input image, while the RNN, conditioned on these features, generates a sequence of words forming the caption. To enable multilingual captioning, the model incorporates a language-specific module that translates the generated captions into the desired target language. This module is trained on a large bilingual image-caption dataset, aligning visual and textual information across languages. Experimental results demonstrate the effectiveness of the proposed model, achieving state-of-the-art performance on various benchmark datasets. This research contributes to the advancement of multimodal learning and opens up new possibilities for applications such as image search, accessibility tools, and cross-cultural communication.},
keywords = {Image Captioning, Multilingual, Deep Learning, CNN, RNN, LSTM, NLP, Machine Translation, Attention Mechanisms, Accessibility, Cross-Cultural Communication, Multilingual NLP Models, Storytelling, Image Correlation},
month = {November},
}
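The abstract describes a standard encoder-decoder captioning pipeline: a CNN extracts visual features, and an LSTM decoder, conditioned on those features, generates the caption word by word. The sketch below illustrates that general architecture only; the backbone choice (ResNet-50), embedding and hidden sizes, and the toy vocabulary are illustrative assumptions, not the configuration used by the authors.

```python
# Minimal sketch of a CNN + LSTM captioning pipeline (assumed architecture,
# not the paper's exact model). Requires torch and torchvision.
import torch
import torch.nn as nn
from torchvision import models


class EncoderCNN(nn.Module):
    """Extract a fixed-size visual feature vector from an image."""
    def __init__(self, embed_size: int):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.cnn = nn.Sequential(*list(backbone.children())[:-1])  # drop classifier head
        self.fc = nn.Linear(backbone.fc.in_features, embed_size)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():                      # keep the pretrained backbone frozen
            feats = self.cnn(images).flatten(1)    # (batch, 2048)
        return self.fc(feats)                      # (batch, embed_size)


class DecoderLSTM(nn.Module):
    """Generate a caption, conditioned on the image features."""
    def __init__(self, embed_size: int, hidden_size: int, vocab_size: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features: torch.Tensor, captions: torch.Tensor) -> torch.Tensor:
        # Prepend the image feature vector as the first "token" of the sequence.
        embeddings = torch.cat([features.unsqueeze(1), self.embed(captions)], dim=1)
        hiddens, _ = self.lstm(embeddings)
        return self.fc(hiddens)                    # per-step vocabulary scores


# Toy forward pass: one 224x224 RGB image, a 5-token caption, 1000-word vocabulary.
encoder = EncoderCNN(embed_size=256)
decoder = DecoderLSTM(embed_size=256, hidden_size=512, vocab_size=1000)
images = torch.randn(1, 3, 224, 224)
captions = torch.randint(0, 1000, (1, 5))
scores = decoder(encoder(images), captions)
print(scores.shape)                                # torch.Size([1, 6, 1000])
```

For the multilingual step described in the abstract, one possible realisation (an assumption, not confirmed by the paper) is to pass the decoded English caption to a pretrained translation model, for example a MarianMT checkpoint loaded through Hugging Face's transformers pipeline, e.g. `pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")`.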