Copyright © 2025. The authors retain the copyright of this article. This article is an open-access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
Cite This Article

@article{180293,
  author   = {Afzal Ahmad Azmi and Anurag Srivastava},
  title    = {Deep Learning Approaches to Sentiment Analysis on Text, Visual, and Audio Modalities: A Review},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  month    = {June},
  volume   = {12},
  number   = {1},
  pages    = {1406--1412},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=180293},
  keywords = {Sentiment Analysis, Deep Learning, Natural Language Processing (NLP), BERT, Text Classification},
  abstract = {Sentiment analysis has rapidly evolved beyond text-only approaches to embrace the rich information contained in images and audio, giving rise to multimodal sentiment analysis. This survey provides a comprehensive review of deep learning methods applied to sentiment classification across text, visual, and audio modalities. We first examine modality-specific encoders: transformer-based and recurrent networks (e.g., BERT, BiLSTM) for textual sentiment, convolutional and vision transformer models (e.g., ResNet, ViT) for image-based emotion recognition, and convolutional–recurrent architectures for audio signals using spectrogram and MFCC features. Next, we analyze fusion strategies—early, late, and hybrid fusion—that integrate modality representations, highlighting the role of attention mechanisms and multimodal transformers (e.g., MMT) in dynamically weighting cross-modal interactions. Benchmark datasets such as CMU-MOSI, CMU-MOSEI, IEMOCAP, and MELD are surveyed, along with evaluation metrics including accuracy, F1-score, and concordance correlation coefficient. We discuss practical applications in social media monitoring, customer feedback analysis, and human–computer interaction. Key challenges such as data imbalance, modality synchronization, domain adaptation, and model interpretability are addressed, alongside proposed solutions like data augmentation, adversarial training, and self-supervised pretraining. Finally, we outline future research directions, including lightweight architectures for edge deployment, advanced fusion techniques, and explainable multimodal sentiment frameworks. By synthesizing recent advances, this survey serves as a roadmap for researchers developing robust and scalable multimodal sentiment analysis systems.},
}
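The abstract contrasts early fusion (combining modality representations before classification) with late fusion (classifying each modality separately and merging the decisions). As a minimal PyTorch sketch of that distinction only: all class names, embedding dimensions, and the three-class output are illustrative assumptions, not the surveyed authors' implementation.

# Hypothetical sketch of early vs. late fusion for multimodal sentiment
# analysis. Embedding sizes mimic common encoder outputs (e.g., a BERT-style
# text vector, a ResNet/ViT image vector, an MFCC-derived audio vector),
# but every dimension and module here is an assumption for illustration.
import torch
import torch.nn as nn

class EarlyFusionClassifier(nn.Module):
    """Early fusion: concatenate modality embeddings, then one shared head."""
    def __init__(self, d_text=768, d_image=512, d_audio=128, n_classes=3):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(d_text + d_image + d_audio, 256),
            nn.ReLU(),
            nn.Linear(256, n_classes),
        )

    def forward(self, t, v, a):
        # Joint representation is formed before any sentiment decision.
        return self.head(torch.cat([t, v, a], dim=-1))

class LateFusionClassifier(nn.Module):
    """Late fusion: score each modality independently, then average logits."""
    def __init__(self, d_text=768, d_image=512, d_audio=128, n_classes=3):
        super().__init__()
        self.text_head = nn.Linear(d_text, n_classes)
        self.image_head = nn.Linear(d_image, n_classes)
        self.audio_head = nn.Linear(d_audio, n_classes)

    def forward(self, t, v, a):
        # Each modality votes on its own; decisions are merged at the end.
        return (self.text_head(t) + self.image_head(v) + self.audio_head(a)) / 3

if __name__ == "__main__":
    t = torch.randn(4, 768)  # batch of text embeddings
    v = torch.randn(4, 512)  # batch of image embeddings
    a = torch.randn(4, 128)  # batch of audio embeddings
    print(EarlyFusionClassifier()(t, v, a).shape)  # torch.Size([4, 3])
    print(LateFusionClassifier()(t, v, a).shape)   # torch.Size([4, 3])

The hybrid and attention-based fusion strategies the abstract also mentions would sit between these two extremes, e.g., learning per-sample weights over modalities instead of fixed concatenation or averaging.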