Copyright © 2026. Authors retain the copyright of this article. This article is an open access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@comment{
  IJIRT manuscript 206019, journal article record.
  The page range is written with a double hyphen and the month with the
  predefined jun macro so that the bibliography style controls formatting.
  The honorific was dropped from the second author because it is not part
  of the name.
  NOTE(review): the first author keeps U as the family-name part, which is
  how the original First-Last form was parsed. Confirm against the
  published byline.
}
@article{206019,
  author   = {U, Nandha Kumar and JeyaLakshmi, S.},
  title    = {Real-Time Conversational Voice Agent Using Large Language Models},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2026},
  month    = jun,
  volume   = {13},
  number   = {1},
  pages    = {8846--8859},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=206019},
  abstract = {Voice interfaces have moved from scripted, intent-based assistants toward open-ended conversational systems built on large language models, yet most reported implementations still treat the problem as a simple chain of speech-to-text, language generation, and text-to-speech, run sequentially with little regard for latency or context retention. This paper presents a real-time conversational voice agent built around an Adaptive Conversation Orchestrator, a coordinating layer that decides, on a turn-by-turn basis, when to retrieve external knowledge, when to summarize conversation history, how to react to a user interruption, and how to allocate a limited latency budget across speech recognition, retrieval, and generation. The orchestrator sits above a Hierarchical Context Memory consisting of an active turn buffer, a summarized session history, and a long-term retrieval index, and selects which of these layers to consult depending on the nature of the incoming utterance, rather than appending the full conversation to every prompt. Knowledge grounding is handled through a Context-Aware Dynamic Retrieval module that routes a query through one of several retrieval paths, ranging from no retrieval for conversational small talk to chunk-level reranked search for questions touching large ingested documents, so that retrieval cost is incurred only when it is likely to improve the answer. Timing across the pipeline is governed by a Response Budget Manager that assigns soft deadlines to speech recognition, memory lookup, vector search, and first-token generation, allowing downstream stages to begin consuming partial output from upstream stages rather than waiting for full completion. The system was implemented with a FastAPI backend communicating over WebSockets with a Next.js frontend, using Whisper-based streaming transcription, ChromaDB for vector storage, and a large language model accessed through a Gemini-compatible API.
The prototype was evaluated across conversational and document-grounded scenarios, measuring first-token latency, speech start latency, retrieval accuracy, transcription error rate, and session stability. The results suggest that orchestrating these components around an explicit latency budget, rather than chaining them naively, allows the system to maintain natural turn-taking and contextual continuity while keeping end-to-end response time within a range suitable for live conversation.},
  keywords = {Conversational AI, Large Language Models, Adaptive Conversation Orchestration, Retrieval-Augmented Generation, Hierarchical Context Memory, Voice Activity Detection, Real-Time Speech Processing, Latency Budgeting},
}
Submit your research paper and those of your network (friends, colleagues, or peers) through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management - 2024. Last Date: 15th March 2024
Submit inquiry