Copyright © 2026. The authors retain the copyright of this article. This article is an open-access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@article{176719,
  author   = {Wagh, Smita and Bahl, Hema Sachin and Sutar, Sakshi Annaso and Yerawar, Bhagyashri Prakash},
  title    = {{AI} Story Craft: A Platform for Collaborative Storytelling, Blending Human Creativity with {AI} Driven Text and Visuals to Craft Engaging Narratives},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  month    = apr,
  volume   = {11},
  number   = {11},
  pages    = {6137--6141},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=176719},
  abstract = {The rapid advancements in both image generation and open-form text generation have opened new avenues for creating interleaved image-text content. This paper focuses on multimodal story generation, an innovative task that integrates narrative text with rich visual elements in a cohesive manner. While promising, this end story presents substantial challenges, particularly in understanding the intricate relationship between text and images, as well as in generating extended sequences of coherent, contextually relevant narratives and visuals. We introduce Story Teller, a groundbreaking approach that harnesses a Multimodal Large Language Model (MLLM) to create comprehensive multimodal stories. Our model excels in predicting both text and visual tokens, employing an adapted visual de tokenizer to generate images that maintain character consistency and stylistic coherence. We also introduce a novel multimodal attention sink mechanism that facilitates the efficient generation of stories with up to 25 interleaved sequences, surpassing the training limit of 10. To support our model and evaluate the multimodal story generation task, we present StoryStream, a large-scale, high-resolution dataset designed for comprehensive training and quantitative analysis. This work aims to advance the state of multimodal storytelling, offering insights and tools for future research in this dynamic field.},
  keywords = {MLLM, Vit, SD-XL, Detokenizer, Tokenizer, Diffusion Model, Story-Teller},
}
Submit your research paper, and those of your network (friends, colleagues, or peers), through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management – 2024. Last date: 15th March 2024.
Submit inquiry