Copyright © 2026 Authors retain the copyright of this article. This article is an open access article distributed under the Creative Commons Attribution License which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
@article{186831,
  author   = {Patel, Roshan and Patil, Darshan and Patil, Sanskar and Patange, Tanuja and Patil, Dnyanesh and Patil, Srinivas and Khan, Faaiz and Patil, Vaishali},
  title    = {Evaluating {Atom of Thoughts} Across Diverse Language Models: A Framework for Enhancing Non-Reasoning {LLMs} Performance},
  journal  = {International Journal of Innovative Research in Technology},
  year     = {2025},
  volume   = {12},
  number   = {6},
  pages    = {2462--2466},
  issn     = {2349-6002},
  url      = {https://ijirt.org/article?manuscript=186831},
  abstract = {Large Language Models (LLMs) have shown remarkable progress, yet enhancing their complex reasoning capabilities, especially during inference (test-time), remains a challenge. Traditional methods often struggle with computational overhead or fail to optimally guide the reasoning process. Atom of Thoughts (AoT) was recently proposed as a novel test-time scaling technique that models reasoning as a Markov process of atomic questions, aiming to improve efficiency and focus. This paper presents the implementation, extension, and critical evaluation of the AoT framework. We developed a comprehensive system featuring multi-model support (integrating OpenAI, Gemini, and OpenRouter models), a user-friendly web interface for experiment configuration and execution, and extended capabilities for mathematical reasoning tasks. Our evaluation across various benchmarks confirms that AoT can enhance the performance of smaller or non-reasoning-focused LLMs. However, our most significant finding reveals a counter-intuitive trend: AoT often degrades the performance of several state-of-the-art reasoning models (e.g., DeepSeek R1, Grok 3, GPT o3-mini). We hypothesize this is due to interference with their specialized internal reasoning mechanisms, potentially involving planning and differing internal vs. external thought processes, which conflicts with AoT's structured decomposition prompts. Notably, highly instruction-following models like Gemini 2.5 Pro Thinking showed minimal performance change, suggesting instruction adherence mitigates this negative interaction. This work provides a practical AoT implementation and offers crucial insights into the interaction between structured reasoning frameworks and the non-reasoning models.},
  keywords = {Atom of Thoughts, LLM Reasoning, Test-Time Scaling, Large Language Models, Model Evaluation, Reasoning Models, Web Interface, Instruction Following, Gemini, OpenRouter, DeepSeek},
  month    = nov,
}
Submit your research paper and those of your network (friends, colleagues, or peers) through your IPN account, and receive 800 INR for each paper that gets published.
Join Now. National Conference on Sustainable Engineering and Management - 2024. Last Date: 15th March 2024.
Submit inquiry