Copyright © 2026. Authors retain the copyright of this article. This article is an open access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
% Journal article record for manuscript 182290 (volume 12, number 2, July 2025).
% NOTE(review): the author name is kept exactly as supplied. In this form BibTeX
% parses "B" as the surname and "DEEPIKA A" as given names -- confirm the intended
% family/given split and rewrite it as "Last, First".
@article{182290,
  author   = {DEEPIKA A B},
  title    = {Multilingual Text-to-Image Generation: A Cross Lingual Synthesis Framework},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  month    = jul,
  volume   = {12},
  number   = {2},
  pages    = {1453--1459},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=182290},
  abstract = {This research investigates the challenges posed by the predominant focus on English language text-to-image generation (TTI) because of the lack of annotated image caption data in other languages. The resulting inequitable access to TTI technology in non-English-speaking regions motivates the research of multilingual TTI (mTTI) and the potential of neural machine translation (NMT) to facilitate its development. The study presents two main contributions. Firstly, a systematic empirical study employing a multilingual multi-modal encoder evaluates standard cross-lingual NLP methods applied to mTTI, including TRANSLATE TRAIN, TRANSLATE TEST, and ZERO-SHOT TRANSFER. Secondly, a novel parameter-efficient approach called Ensemble Adapter (ENSAD) is introduced, leveraging multilingual text knowledge within the mTTI framework to avoid the language gap and enhance mTTI performance. Additionally, the research addresses challenges associated with transformer-based TTI models, such as slow generation and complexity for high-resolution images. It proposes hierarchical transformers and local parallel autoregressive generation techniques to overcome these limitations. A 6B-parameter transformer pretrained with a cross-modal general language model (CogLM) and fine-tuned for fast super-resolution results in a new text-to-image system, denoted as It, which demonstrates competitive performance compared to the state-of-the-art DALL-E-2. Furthermore, It supports interactive text-guided editing on images, offering a versatile and efficient solution for text-to-image generation.},
  keywords = {Text-to-image generation, Multilingual TTI (mTTI), Neural machine translation (NMT), Cross-lingual NLP, Ensemble Adapter (ENSAD), Hierarchical transformers, Super-resolution, Transformer-based models, Cross-modal general language model (CogLM)},
}
Submit your research paper and those of your network (friends, colleagues, or peers) through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management - 2024. Last Date: 15th March 2024.
Submit inquiry