Copyright © 2025 Authors retain the copyright of this article. This article is an open access article distributed under the Creative Commons Attribution License which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@article{184983,
  author   = {Subrahmanyam, V. and Siva Prasad, M. V.},
  title    = {Language-Guided {3D} Segmentation for Robotics and {AR} Applications},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  volume   = {12},
  number   = {4},
  pages    = {4300--4306},
  month    = sep,
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=184983},
  abstract = {Language-guided 3D segmentation combines natural language understanding with geometric scene interpretation to enable intuitive, flexible, and task-oriented scene parsing for robotics and augmented reality (AR). We propose a unified framework, Lang3D-Seg, which aligns textual instructions and free-form language queries with 3D volumetric and point-based representations to produce accurate semantic and instance segmentations. Lang3D-Seg uses a cross-modal transformer backbone that ingests multi-view RGB images, point clouds, and language tokens; it leverages a joint positional encoding scheme and a novel Text-Conditioned Graph Propagation (TCGP) module to refine segmentation masks in 3D. We evaluate Lang3D-Seg on benchmarks synthesized from ScanNet-style indoor scenes and a new robotics-focused dataset (RG3D) containing command-driven segmentation tasks and real robot interaction traces. Our approach significantly improves zero-shot and few-shot language-conditioned segmentation performance compared to baselines, reduces inference latency suitable for real-time robotics, and demonstrates robust generalization to unseen environments and compositional language queries. We show downstream utility through two application case studies: (1) goal-oriented object manipulation in a mobile manipulator and (2) contextual AR annotation and selective occlusion in a head-mounted display. We release code, trained models, and RG3D dataset splits to facilitate follow-up research.},
  keywords = {Language-guided 3D segmentation, Robotics, Augmented Reality, Multimodal learning, Cross-modal transformer, point cloud, Text-conditioned graph propagation, Human-robot interaction, Scene understanding, Referring expressions},
}
Cite This Article
Submit your research paper and those of your network (friends, colleagues, or peers) through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management - 2024. Last Date: 15th March 2024.
Submit inquiry