Copyright © 2026. Authors retain the copyright of this article. This article is an open access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@comment{Journal article record for manuscript 174093 (see the url field). The citation key is kept unchanged so existing citations still resolve.
NOTE(review): author names and the title are kept in the exported upper-case form; confirm the surname split and the intended capitalisation before converting to Last, First form or title case.}
@article{174093,
  author   = {G. BHARGAVI and AGADDA VEERA BRAHMA SIVA RAMA KRISHNA and D. TRINADH and VIJAY CHIKKALA and BOTCHA SAI CHARAN},
  title    = {VOCASYNTH -- A UNIVERSAL VOICE COMPANION},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  month    = mar,
  volume   = {11},
  number   = {10},
  pages    = {2833--2841},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=174093},
  abstract = {This project enhances human-computer interaction by integrating speech recognition, AI-generated responses, and document summarization within a streamlined Streamlit interface. It enables users to communicate with AI through voice input while also facilitating document-based information retrieval. The speech-based interaction utilizes SpeechRecognition to convert spoken words into text, which is then processed by Google Generative AI (Gemini-1.5-flash) to generate relevant and context-aware responses. The responses are displayed in a chat interface and converted back into speech using pyttsx3 and gTTS, providing an immersive conversational experience. A queue-based processing mechanism ensures efficient voice output handling, preventing overlap in speech generation. Users can also adjust voice settings for a more personalized experience. For document summarization, users can upload files in TXT, PDF, or DOCX formats. The text is extracted using PyPDF2 and python-docx, processed with pandas, and summarized into a concise version. The system employs chunk-based processing to handle long documents, ensuring accurate and meaningful summaries while maintaining coherence. The generated summaries help users quickly grasp key points from large texts. The application supports multiple languages, including English, Hindi, and Telugu, making it accessible to a diverse user base. Users can save and download chat histories for future reference, enhancing usability. By integrating voice-based AI interaction and automated summarization, the system significantly improves efficiency in accessing and processing information, benefiting researchers, students, professionals, and customer support applications.},
  keywords = {Speech Recognition, Google Generative AI (Gemini-1.5-flash), Streamlit, pyttsx3, gTTS, voice-based interaction, text-to-speech (TTS), speech-to-text (STT), queue-based processing, conversational AI, chunk-based text processing, multilingual support, chat history storage, real-time AI responses, personalized voice settings, information retrieval},
}
Submit your research paper and those of your network (friends, colleagues, or peers) through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management - 2024. Last Date: 15th March 2024.
Submit inquiry