Paper List
Return a paginated listing of all papers. Results can be filtered with a free-text search term via the q parameter and stepped through page by page via the page parameter, as in the example request below.
GET /api/v1/papers/?page=479&q=Large+Language+Models
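One way to issue this request is with a small script. The sketch below uses Python's requests library against the URL and the page/q parameters shown above; the fetch_papers helper name is only illustrative, and the use of the next/results pagination fields is an assumption based on the (truncated) response excerpt that follows.

```python
# Minimal sketch of querying the paper listing; the endpoint and the
# "q"/"page" parameters come from the example request above, while the
# helper name and the "next"/"results" pagination fields are assumptions
# based on the response excerpt shown below.
import requests

BASE_URL = "https://paperswithcode.com/api/v1/papers/"

def fetch_papers(query, page=1):
    """Fetch one page of the paper listing for a free-text query."""
    resp = requests.get(BASE_URL, params={"q": query, "page": page})
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    data = fetch_papers("Large Language Models", page=479)
    # Each entry in "results" carries the fields shown in the excerpt
    # below (id, arxiv_id, url_abs, url_pdf, title, abstract, authors, ...).
    for paper in data.get("results", []):
        print(paper["published"], paper["title"])
    # The "next"/"previous" URLs point at the adjacent pages and can be
    # requested directly to walk through the full listing.
    print("next page:", data.get("next"))
```

Each page of the response is a JSON object whose results array holds one record per paper (id, arxiv_id, title, abstract, authors, published date, and links to the abstract and PDF), alongside previous/next URLs for the adjacent pages. The excerpt below is truncated and picks up part-way through such a response.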
https://paperswithcode.com/api/v1/papers/?page=480&q=Large+Language+Models", "previous": "https://paperswithcode.com/api/v1/papers/?page=478&q=Large+Language+Models", "results": [ { "id": "explainable-xr-understanding-user-behaviors", "arxiv_id": "2501.13778", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13778v1", "url_pdf": "https://arxiv.org/pdf/2501.13778v1.pdf", "title": "Explainable XR: Understanding User Behaviors of XR Environments using LLM-assisted Analytics Framework", "abstract": "We present Explainable XR, an end-to-end framework for analyzing user behavior in diverse eXtended Reality (XR) environments by leveraging Large Language Models (LLMs) for data interpretation assistance. Existing XR user analytics frameworks face challenges in handling cross-virtuality - AR, VR, MR - transitions, multi-user collaborative application scenarios, and the complexity of multimodal data. Explainable XR addresses these challenges by providing a virtuality-agnostic solution for the collection, analysis, and visualization of immersive sessions. We propose three main components in our framework: (1) A novel user data recording schema, called User Action Descriptor (UAD), that can capture the users' multimodal actions, along with their intents and the contexts; (2) a platform-agnostic XR session recorder, and (3) a visual analytics interface that offers LLM-assisted insights tailored to the analysts' perspectives, facilitating the exploration and analysis of the recorded XR session data. We demonstrate the versatility of Explainable XR by demonstrating five use-case scenarios, in both individual and collaborative XR applications across virtualities. Our technical evaluation and user studies show that Explainable XR provides a highly usable analytics solution for understanding user actions and delivering multifaceted, actionable insights into user behaviors in immersive environments.", "authors": [ "Arie E. Kaufman", "Klaus Mueller", "Saeed Boorboor", "Mithilesh Singh", "Zainab Aamir", "Yoonsang Kim" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "do-large-language-models-truly-understand", "arxiv_id": "2501.13773", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13773v1", "url_pdf": "https://arxiv.org/pdf/2501.13773v1.pdf", "title": "Do Large Language Models Truly Understand Geometric Structures?", "abstract": "Geometric ability is a significant challenge for large language models (LLMs) due to the need for advanced spatial comprehension and abstract thinking. Existing datasets primarily evaluate LLMs on their final answers, but they cannot truly measure their true understanding of geometric structures, as LLMs can arrive at correct answers by coincidence. To fill this gap, we introduce the GeomRel dataset, designed to evaluate LLMs' understanding of geometric structures by isolating the core step of geometric relationship identification in problem-solving. Using this benchmark, we conduct thorough evaluations of diverse LLMs and identify key limitations in understanding geometric structures. 
We further propose the Geometry Chain-of-Thought (GeoCoT) method, which enhances LLMs' ability to identify geometric relationships, resulting in significant performance improvements.", "authors": [ "Rui Wang", "Wenhong Zhu", "Yiming Wang", "XiaoFeng Wang" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "rpo-retrieval-preference-optimization-for", "arxiv_id": "2501.13726", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13726v1", "url_pdf": "https://arxiv.org/pdf/2501.13726v1.pdf", "title": "RPO: Retrieval Preference Optimization for Robust Retrieval-Augmented Generation", "abstract": "While Retrieval-Augmented Generation (RAG) has exhibited promise in utilizing external knowledge, its generation process heavily depends on the quality and accuracy of the retrieved context. Large language models (LLMs) struggle to evaluate the correctness of non-parametric knowledge retrieved externally when it differs from internal memorization, leading to knowledge conflicts during response generation. To this end, we introduce the Retrieval Preference Optimization (RPO), a lightweight and effective alignment method to adaptively leverage multi-source knowledge based on retrieval relevance. An implicit representation of retrieval relevance is derived and incorporated into the reward model to integrate retrieval evaluation and response generation into a single model, solving the problem that previous methods necessitate the additional procedure to assess the retrieval quality. Notably, RPO is the only RAG-dedicated alignment approach that quantifies the awareness of retrieval relevance in training, overcoming mathematical obstacles. Experiments on four datasets demonstrate that RPO outperforms RAG by 4-10% in accuracy without any extra component, exhibiting its robust generalization.", "authors": [ "Zhen-Hua Ling", "Shi-Qi Yan" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "di-bench-benchmarking-large-language-models", "arxiv_id": "2501.13699", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13699v1", "url_pdf": "https://arxiv.org/pdf/2501.13699v1.pdf", "title": "DI-BENCH: Benchmarking Large Language Models on Dependency Inference with Testable Repositories at Scale", "abstract": "Large Language Models have advanced automated software development, however, it remains a challenge to correctly infer dependencies, namely, identifying the internal components and external packages required for a repository to successfully run. Existing studies highlight that dependency-related issues cause over 40\\% of observed runtime errors on the generated repository. To address this, we introduce DI-BENCH, a large-scale benchmark and evaluation framework specifically designed to assess LLMs' capability on dependency inference. The benchmark features 581 repositories with testing environments across Python, C#, Rust, and JavaScript. Extensive experiments with textual and execution-based metrics reveal that the current best-performing model achieves only a 42.9% execution pass rate, indicating significant room for improvement. 
DI-BENCH establishes a new viewpoint for evaluating LLM performance on repositories, paving the way for more robust end-to-end software synthesis.", "authors": [ "Qi Zhang", "Dongmei Zhang", "Saravan Rajmohan", "Yingnong Dang", "QIngwei Lin", "Elsie Nallipogu", "Yufan Huang", "Maoquan Wang", "Chengxing Xie", "Jiaheng Wen", "Bowen Li", "Yu Kang", "Chaoyun Zhang", "Shilin He", "Junhao Wang", "Linghao Zhang" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "lvpruning-an-effective-yet-simple-language", "arxiv_id": "2501.13652", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13652v1", "url_pdf": "https://arxiv.org/pdf/2501.13652v1.pdf", "title": "LVPruning: An Effective yet Simple Language-Guided Vision Token Pruning Approach for Multi-modal Large Language Models", "abstract": "Multi-modal Large Language Models (MLLMs) have achieved remarkable success by integrating visual and textual modalities. However, they incur significant computational overhead due to the large number of vision tokens processed, limiting their practicality in resource-constrained environments. We introduce Language-Guided Vision Token Pruning (LVPruning) for MLLMs, an effective yet simple method that significantly reduces the computational burden while preserving model performance. LVPruning employs cross-attention modules to compute the importance of vision tokens based on their interaction with language tokens, determining which to prune. Importantly, LVPruning can be integrated without modifying the original MLLM parameters, which makes LVPruning simple to apply or remove. Our experiments show that LVPruning can effectively reduce up to 90% of vision tokens by the middle layer of LLaVA-1.5, resulting in a 62.1% decrease in inference Tera Floating-Point Operations Per Second (TFLOPs), with an average performance loss of just 0.45% across nine multi-modal benchmarks.", "authors": [ "Riza Batista-Navarro", "Chenghua Lin", "Jingyuan Sun", "Hao Li", "Yanze Xin", "Yizheng Sun" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "improving-contextual-faithfulness-of-large", "arxiv_id": "2501.13573", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13573v1", "url_pdf": "https://arxiv.org/pdf/2501.13573v1.pdf", "title": "Improving Contextual Faithfulness of Large Language Models via Retrieval Heads-Induced Optimization", "abstract": "Ensuring contextual faithfulness in retrieval-augmented large language models (LLMs) is crucial for building trustworthy information-seeking systems, particularly in long-form question-answering (LFQA) scenarios. In this work, we identify a salient correlation between LFQA faithfulness and retrieval heads, a set of attention heads responsible for retrieving contextual information. Leveraging this insight, we propose RHIO, a framework designed to teach LLMs to explicitly discriminate between faithful and unfaithful generations. RHIO first augments unfaithful samples that simulate realistic model-intrinsic errors by selectively masking retrieval heads. Then, these samples are incorporated into joint training, enabling the model to distinguish unfaithful outputs from faithful ones conditioned on control tokens. Furthermore, these control tokens are leveraged to self-induce contrastive outputs, amplifying their difference through contrastive decoding. 
Additionally, to facilitate the evaluation of contextual faithfulness, we also introduce GroundBench, a comprehensive benchmark compiled from five existing LFQA datasets. Extensive experimental results on GroundBench demonstrate that RHIO significantly improves faithfulness, even outperforming GPT-4o.", "authors": [ "Bing Qin", "Guoping Hu", "Dayong Wu", "Baoxin Wang", "Yuxuan Gu", "Weihong Zhong", "Yangfan Ye", "Xiachong Feng", "Yuchun Fan", "Weitao Ma", "Xiaocheng Feng", "Lei Huang" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "can-large-language-models-understand", "arxiv_id": "2501.13391", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13391v1", "url_pdf": "https://arxiv.org/pdf/2501.13391v1.pdf", "title": "Can Large Language Models Understand Preferences in Personalized Recommendation?", "abstract": "Large Language Models (LLMs) excel in various tasks, including personalized recommendations. Existing evaluation methods often focus on rating prediction, relying on regression errors between actual and predicted ratings. However, user rating bias and item quality, two influential factors behind rating scores, can obscure personal preferences in user-item pair data. To address this, we introduce PerRecBench, disassociating the evaluation from these two factors and assessing recommendation techniques on capturing the personal preferences in a grouped ranking manner. We find that the LLM-based recommendation techniques that are generally good at rating prediction fail to identify users' favored and disfavored items when the user rating bias and item quality are eliminated by grouping users. With PerRecBench and 19 LLMs, we find that while larger models generally outperform smaller ones, they still struggle with personalized recommendation. Our findings reveal the superiority of pairwise and listwise ranking approaches over pointwise ranking, PerRecBench's low correlation with traditional regression metrics, the importance of user profiles, and the role of pretraining data distributions. We further explore three supervised fine-tuning strategies, finding that merging weights from single-format training is promising but improving LLMs' understanding of user preferences remains an open research problem. Code and data are available at https://github.com/TamSiuhin/PerRecBench", "authors": [ "Meng Jiang", "Fengran Mo", "Zheyuan Liu", "Zhenyu Wu", "Qingkai Zeng", "Zinan Zeng", "Zhaoxuan Tan" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "do-as-we-do-not-as-you-think-the-conformity", "arxiv_id": "2501.13381", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13381v1", "url_pdf": "https://arxiv.org/pdf/2501.13381v1.pdf", "title": "Do as We Do, Not as You Think: the Conformity of Large Language Models", "abstract": "Recent advancements in large language models (LLMs) revolutionize the field of intelligent agents, enabling collaborative multi-agent systems capable of tackling complex problems across various domains. However, the potential of conformity within these systems, analogous to phenomena like conformity bias and groupthink in human group dynamics, remains largely unexplored, raising concerns about their collective problem-solving capabilities and possible ethical implications. 
This paper presents a comprehensive study on conformity in LLM-driven multi-agent systems, focusing on three aspects: the existence of conformity, the factors influencing conformity, and potential mitigation strategies. In particular, we introduce BenchForm, a new conformity-oriented benchmark, featuring reasoning-intensive tasks and five distinct interaction protocols designed to probe LLMs' behavior in collaborative scenarios. Several representative LLMs are evaluated on BenchForm, using metrics such as conformity rate and independence rate to quantify conformity's impact. Our analysis delves into factors influencing conformity, including interaction time and majority size, and examines how the subject agent rationalizes its conforming behavior. Furthermore, we explore two strategies to mitigate conformity effects, i.e., developing enhanced personas and implementing a reflection mechanism. Several interesting findings regarding LLMs' conformity are derived from empirical results and case studies. We hope that these insights can pave the way for more robust and ethically-aligned collaborative AI systems. Our benchmark and code are available at BenchForm.", "authors": [ "Wenguan Wang", "Guikun Chen", "Zhiyuan Weng" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "hypothesis-generation-for-materials-discovery", "arxiv_id": "2501.13299", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13299v1", "url_pdf": "https://arxiv.org/pdf/2501.13299v1.pdf", "title": "Hypothesis Generation for Materials Discovery and Design Using Goal-Driven and Constraint-Guided LLM Agents", "abstract": "Materials discovery and design are essential for advancing technology across various industries by enabling the development of application-specific materials. Recent research has leveraged Large Language Models (LLMs) to accelerate this process. We explore the potential of LLMs to generate viable hypotheses that, once validated, can expedite materials discovery. Collaborating with materials science experts, we curated a novel dataset from recent journal publications, featuring real-world goals, constraints, and methods for designing real-world applications. Using this dataset, we test LLM-based agents that generate hypotheses for achieving given goals under specific constraints. To assess the relevance and quality of these hypotheses, we propose a novel scalable evaluation metric that emulates the process a materials scientist would use to evaluate a hypothesis critically. Our curated dataset, proposed method, and evaluation framework aim to advance future research in accelerating materials discovery and design with LLMs.", "authors": [ "Chitta Baral", "Ashif Iquebal", "Divij Handa", "Kevin Coutinho", "Venkatesh Mishra", "Shrinidhi Kumbhar" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "rag-reward-optimizing-rag-with-reward", "arxiv_id": "2501.13264", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13264v1", "url_pdf": "https://arxiv.org/pdf/2501.13264v1.pdf", "title": "RAG-Reward: Optimizing RAG with Reward Modeling and RLHF", "abstract": "Retrieval-augmented generation (RAG) enhances Large Language Models (LLMs) with relevant and up-to-date knowledge, improving their ability to answer knowledge-intensive questions. It has been shown to enhance both generation quality and trustworthiness. 
While numerous works have focused on improving retrieval, generation, and evaluation, the role of reward models in reinforcement learning for optimizing RAG and establishing automated benchmarking pipelines remains underexplored. In this paper, we introduce \\textbf{RAG-Reward}, a dataset designed to enable \\textit{hallucination-free, comprehensive, reliable, and efficient RAG}. We define four key metrics for assessing generation quality and develop an automated annotation pipeline that leverages multiple LLMs to generate outputs across diverse RAG scenarios. GPT-4o is used to evaluate and construct preference data. Using \\textbf{RAG-Reward}, we train reward models and apply reinforcement learning with human feedback (RLHF) to improve LLMs' effectiveness in RAG. Experimental results show that our reward model achieves state-of-the-art performance on a held-out test set, demonstrating both the effectiveness of our approach and the quality of our dataset. Furthermore, the improved generation quality of the trained policy model highlights the feasibility of using RLHF to enhance RAG pipelines.", "authors": [ "Cheng Niu", "Tong Zhang", "Yuanhao Wu", "Juno Zhu", "Juntong Song", "Hanning Zhang" ], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "large-language-model-driven-policy", "arxiv_id": "2501.13816", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13816v1", "url_pdf": "https://arxiv.org/pdf/2501.13816v1.pdf", "title": "Large Language Model driven Policy Exploration for Recommender Systems", "abstract": "Recent advancements in Recommender Systems (RS) have incorporated Reinforcement Learning (RL), framing the recommendation as a Markov Decision Process (MDP). However, offline RL policies trained on static user data are vulnerable to distribution shift when deployed in dynamic online environments. Additionally, excessive focus on exploiting short-term relevant items can hinder exploration, leading to suboptimal recommendations and negatively impacting long-term user gains. Online RL-based RS also face challenges in production deployment, due to the risks of exposing users to untrained or unstable policies. Large Language Models (LLMs) offer a promising solution to mimic user objectives and preferences for pre-training policies offline to enhance the initial recommendations in online settings. Effectively managing distribution shift and balancing exploration are crucial for improving RL-based RS, especially when leveraging LLM-based pre-training. To address these challenges, we propose an Interaction-Augmented Learned Policy (iALP) that utilizes user preferences distilled from an LLM. Our approach involves prompting the LLM with user states to extract item preferences, learning rewards based on feedback, and updating the RL policy using an actor-critic framework. Furthermore, to deploy iALP in an online scenario, we introduce an adaptive variant, A-iALP, that implements a simple fine-tuning strategy (A-iALP$_{ft}$), and an adaptive approach (A-iALP$_{ap}$) designed to mitigate issues with compromised policies and limited exploration. Experiments across three simulated environments demonstrate that A-iALP introduces substantial performance improvements", "authors": [ "Joemon M. 
Jose", "Ioannis Arapakis", "Alexandros Karatzoglou", "Jie Wang" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "exploring-gpt-s-ability-as-a-judge-in-music", "arxiv_id": "2501.13261", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13261v1", "url_pdf": "https://arxiv.org/pdf/2501.13261v1.pdf", "title": "Exploring GPT's Ability as a Judge in Music Understanding", "abstract": "Recent progress in text-based Large Language Models (LLMs) and their extended ability to process multi-modal sensory data have led us to explore their applicability in addressing music information retrieval (MIR) challenges. In this paper, we use a systematic prompt engineering approach for LLMs to solve MIR problems. We convert the music data to symbolic inputs and evaluate LLMs' ability in detecting annotation errors in three key MIR tasks: beat tracking, chord extraction, and key estimation. A concept augmentation method is proposed to evaluate LLMs' music reasoning consistency with the provided music concepts in the prompts. Our experiments tested the MIR capabilities of Generative Pre-trained Transformers (GPT). Results show that GPT has an error detection accuracy of 65.20%, 64.80%, and 59.72% in beat tracking, chord extraction, and key estimation tasks, respectively, all exceeding the random baseline. Moreover, we observe a positive correlation between GPT's error finding accuracy and the amount of concept information provided. The current findings based on symbolic music input provide a solid ground for future LLM-based MIR research.", "authors": [ "Ichiro Fujinaga", "Gus Xia", "Ziyu Wang", "Kun Fang" ], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "privacy-preserving-personalized-federated", "arxiv_id": "2501.13904", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13904v2", "url_pdf": "https://arxiv.org/pdf/2501.13904v2.pdf", "title": "Privacy-Preserving Personalized Federated Prompt Learning for Multimodal Large Language Models", "abstract": "Multimodal Large Language Models (LLMs) are pivotal in revolutionizing customer support and operations by integrating multiple modalities such as text, images, and audio. Federated Prompt Learning (FPL) is a recently proposed approach that combines pre-trained multimodal LLMs such as vision-language models with federated learning to create personalized, privacy-preserving AI systems. However, balancing the competing goals of personalization, generalization, and privacy remains a significant challenge. Over-personalization can lead to overfitting, reducing generalizability, while stringent privacy measures, such as differential privacy, can hinder both personalization and generalization. In this paper, we propose a Differentially Private Federated Prompt Learning (DP-FPL) approach to tackle this challenge by leveraging a low-rank adaptation scheme to capture generalization while maintaining a residual term that preserves expressiveness for personalization. To ensure privacy, we introduce a novel method where we apply local differential privacy to the two low-rank components of the local prompt, and global differential privacy to the global prompt. Our approach mitigates the impact of privacy noise on the model performance while balancing the tradeoff between personalization and generalization. 
Extensive experiments demonstrate the effectiveness of our approach over other benchmarks.", "authors": [ "Ana Milanova", "Stacy Patterson", "Wei Sun", "Linh Tran" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "gpt-htree-a-decision-tree-framework", "arxiv_id": "2501.13743", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13743v1", "url_pdf": "https://arxiv.org/pdf/2501.13743v1.pdf", "title": "GPT-HTree: A Decision Tree Framework Integrating Hierarchical Clustering and Large Language Models for Explainable Classification", "abstract": "This paper introduces GPT-HTree, a framework combining hierarchical clustering, decision trees, and large language models (LLMs) to address this challenge. By leveraging hierarchical clustering to segment individuals based on salient features, resampling techniques to balance class distributions, and decision trees to tailor classification paths within each cluster, GPT-HTree ensures both accuracy and interpretability. LLMs enhance the framework by generating human-readable cluster descriptions, bridging quantitative analysis with actionable insights.", "authors": [ "Yigit Ihlamur", "Aaron Ontoyin Yin", "Fuat Alican", "Te Pei" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "humorreject-decoupling-llm-safety-from", "arxiv_id": "2501.13677", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13677v1", "url_pdf": "https://arxiv.org/pdf/2501.13677v1.pdf", "title": "HumorReject: Decoupling LLM Safety from Refusal Prefix via A Little Humor", "abstract": "Large Language Models (LLMs) commonly rely on explicit refusal prefixes for safety, making them vulnerable to prefix injection attacks. We introduce HumorReject, a novel data-driven approach that fundamentally reimagines LLM safety by decoupling it from refusal prefixes through the use of humor as an indirect refusal strategy. Rather than explicitly rejecting harmful instructions, HumorReject responds with contextually appropriate humor that naturally defuses potentially dangerous requests while maintaining engaging interactions. Our approach effectively addresses the common \"over-defense\" issues in existing safety mechanisms, demonstrating superior robustness against various attack vectors while preserving natural and high-quality interactions on legitimate tasks. Our findings suggest that innovations at the data level are even more fundamental than the alignment algorithm itself in achieving effective LLM safety, opening new directions for developing more resilient and user-friendly AI systems.", "authors": [ "Zhaoxiang Liu", "Jiacheng Luo", "Haichang Gao", "Zihui Wu" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "spurious-forgetting-in-continual-learning-of", "arxiv_id": "2501.13453", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13453v1", "url_pdf": "https://arxiv.org/pdf/2501.13453v1.pdf", "title": "Spurious Forgetting in Continual Learning of Language Models", "abstract": "Recent advancements in large language models (LLMs) reveal a perplexing phenomenon in continual learning: despite extensive training, models experience significant performance declines, raising questions about task alignment and underlying knowledge retention. 
This study first explores the concept of \"spurious forgetting\", proposing that such performance drops often reflect a decline in task alignment rather than true knowledge loss. Through controlled experiments with a synthesized dataset, we investigate the dynamics of model performance during the initial training phases of new tasks, discovering that early optimization steps can disrupt previously established task alignments. Our theoretical analysis connects these shifts to orthogonal updates in model weights, providing a robust framework for understanding this behavior. Ultimately, we introduce a Freezing strategy that fix the bottom layers of the model, leading to substantial improvements in four continual learning scenarios. Our findings underscore the critical distinction between task alignment and knowledge retention, paving the way for more effective strategies in continual learning.", "authors": [ "Qianli Ma", "Shengjie Qiu", "Xidi Cai", "Junhao Zheng" ], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "iserve-an-intent-based-serving-system-for", "arxiv_id": "2501.13111", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13111v1", "url_pdf": "https://arxiv.org/pdf/2501.13111v1.pdf", "title": "iServe: An Intent-based Serving System for LLMs", "abstract": "Large Language Models (LLMs) are becoming ubiquitous across industries, where applications demand they fulfill diverse user intents. However, developers currently face the challenge of manually exploring numerous deployment configurations - combinations of parallelism and compression techniques that impact resource usage, latency, cost, and accuracy - to meet these intents. Assessing the impact of these configurations on user metrics requires extensive, costly profiling for each model. Existing approaches avoid this expense by using fixed, static configurations, but this often leads to sub-optimal performance and higher costs. Moreover, none of these solutions dynamically adapt to changing user intents to balance latency and cost, effectively. We present iServe, an automated, intent-based system for distributed LLM inference. Instead of manually selecting deployment configurations, developers simply specify their intent - such as minimizing latency, reducing cost, or meeting specific targets for either. iServe introduces fingerprints, lightweight representations of LLMs, to efficiently estimate how different configurations impact latency and memory usage. Based on these insights and GPU availability, iServe dynamically selects the optimal configuration to align with the user's intent. For various LLMs and query arrival rates, iServe best meets user intents compared to state-of-the-art systems by reducing latency by 77.62% and SLO violations by 7.09x while improving GPU throughput by 4.72x. Moreover, iServe's fingerprint-based profiling reduces profiling cost by 6.05x (GPU-hours) compared to baselines.", "authors": [ "Neeraja J. 
Yadwadkar", "Prasoon Sinha", "Tianrui Hu", "Dimitrios Liakopoulos" ], "published": "2025-01-08", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "large-language-model-based-semantic", "arxiv_id": "2501.12988", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12988v1", "url_pdf": "https://arxiv.org/pdf/2501.12988v1.pdf", "title": "Large Language Model-Based Semantic Communication System for Image Transmission", "abstract": "The remarkable success of Large Language Models (LLMs) in understanding and generating various data types, such as images and text, has demonstrated their ability to process and extract semantic information across diverse domains. This transformative capability lays the foundation for semantic communications, enabling highly efficient and intelligent communication systems. In this work, we present a novel OFDM-based semantic communication framework for image transmission. We propose an innovative semantic encoder design that leverages the ability of LLMs to extract the meaning of transmitted data rather than focusing on its raw representation. On the receiver side, we design an LLM-based semantic decoder capable of comprehending context and generating the most appropriate representation to fit the given context. We evaluate our proposed system under different scenarios, including Urban Macro-cell environments with varying speed ranges. The evaluation metrics demonstrate that our proposed system reduces the data size 4250 times, while achieving a higher data rate compared to conventional communication methods. This approach offers a robust and scalable solution to unlock the full potential of 6G connectivity.", "authors": [ "Osama Saleem", "Soheyb Ribouh" ], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "contextualizing-recommendation-explanations", "arxiv_id": "2501.12152", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12152v1", "url_pdf": "https://arxiv.org/pdf/2501.12152v1.pdf", "title": "Contextualizing Recommendation Explanations with LLMs: A User Study", "abstract": "Large language models (LLMs) are increasingly prevalent in recommender systems, where LLMs can be used to generate personalized recommendations. Here, we examine how different LLM-generated explanations for movie recommendations affect users' perceptions of cognitive, affective, and utilitarian needs and consumption intentions. In a pre-registered, between-subject online experiment (N=759) and follow-up interviews (N=30), we compare (a) LLM-generated generic explanations, and (b) LLM-generated contextualized explanations. Our findings show that contextualized explanations (i.e., explanations that incorporate users' past behaviors) effectively meet users' cognitive needs while increasing users' intentions to watch recommended movies. However, adding explanations offers limited benefits in meeting users' utilitarian and affective needs, raising concerns about the proper design and implications of LLM-generated explanations. Qualitative insights from interviews reveal that referencing users' past preferences enhances trust and understanding but can feel excessive if overused. Furthermore, users with more active and positive engagement with the recommender system and movie-watching get substantial gains from contextualized explanations. 
Overall, our research clarifies how LLM-generated recommendations influence users' motivations and behaviors, providing valuable insights for the future development of user-centric recommender systems, a key element in social media platforms and online ecosystems.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-functional-software-reference-architecture", "arxiv_id": "2501.12904", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12904v1", "url_pdf": "https://arxiv.org/pdf/2501.12904v1.pdf", "title": "A Functional Software Reference Architecture for LLM-Integrated Systems", "abstract": "The integration of large language models into software systems is transforming capabilities such as natural language understanding, decision-making, and autonomous task execution. However, the absence of a commonly accepted software reference architecture hinders systematic reasoning about their design and quality attributes. This gap makes it challenging to address critical concerns like privacy, security, modularity, and interoperability, which are increasingly important as these systems grow in complexity and societal impact. In this paper, we describe our \\textit{emerging} results for a preliminary functional reference architecture as a conceptual framework to address these challenges and guide the design, evaluation, and evolution of large language model-integrated systems. We identify key architectural concerns for these systems, informed by current research and practice. We then evaluate how the architecture addresses these concerns and validate its applicability using three open-source large language model-integrated systems in computer vision, text processing, and coding.", "authors": [], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "evaluating-efficiency-and-engagement-in", "arxiv_id": "2501.12128", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12128v1", "url_pdf": "https://arxiv.org/pdf/2501.12128v1.pdf", "title": "Evaluating Efficiency and Engagement in Scripted and LLM-Enhanced Human-Robot Interactions", "abstract": "To achieve natural and intuitive interaction with people, HRI frameworks combine a wide array of methods for human perception, intention communication, human-aware navigation and collaborative action. In practice, when encountering unpredictable behavior of people or unexpected states of the environment, these frameworks may lack the ability to dynamically recognize such states, adapt and recover to resume the interaction. Large Language Models (LLMs), owing to their advanced reasoning capabilities and context retention, present a promising solution for enhancing robot adaptability. This potential, however, may not directly translate to improved interaction metrics. This paper considers a representative interaction with an industrial robot involving approach, instruction, and object manipulation, implemented in two conditions: (1) fully scripted and (2) including LLM-enhanced responses. We use gaze tracking and questionnaires to measure the participants' task efficiency, engagement, and robot perception. The results indicate higher subjective ratings for the LLM condition, but objective metrics show that the scripted condition performs comparably, particularly in efficiency and focus during simple tasks. 
We also note that the scripted condition may have an edge over LLM-enhanced responses in terms of response latency and energy consumption, especially for trivial and repetitive interactions.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "dissecting-the-nvidia-hopper-architecture", "arxiv_id": "2501.12084", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12084v1", "url_pdf": "https://arxiv.org/pdf/2501.12084v1.pdf", "title": "Dissecting the NVIDIA Hopper Architecture through Microbenchmarking and Multiple Level Analysis", "abstract": "Modern GPUs, with their specialized hardware like tensor cores, are essential for demanding AI and deep learning applications. This study presents a comprehensive, multi-level microbenchmarking analysis of the NVIDIA Hopper GPU architecture, delving into its performance characteristics and novel features. We benchmark Hopper's memory subsystem latency and throughput, comparing its L2 partitioned cache behavior and global memory access patterns against recent GPU generations, Ampere and Ada Lovelace. Our analysis reveals significant performance differences and architectural improvements in Hopper. A core contribution of this work is a detailed evaluation of Hopper's fourth-generation tensor cores, including their FP8 precision support and the novel asynchronous wgmma instructions, assessing their impact on matrix multiply-accumulate operations. We further investigate the performance implications of other key Hopper innovations: DPX instructions for accelerating dynamic programming algorithms, distributed shared memory (DSM) for inter-SM communication, and the Tensor Memory Accelerator (TMA) for asynchronous data movement. This multi-level approach encompasses instruction-level microbenchmarks, library-level analysis of the Transformer Engine, and application-level benchmarks of tensor core performance within large language models. Our findings provide valuable, in-depth insights for software developers seeking to optimize performance and develop accurate performance models for the Hopper architecture, ultimately contributing to a deeper understanding of its potential for accelerating AI and other computationally intensive workloads.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "pinnsagent-automated-pde-surrogation-with", "arxiv_id": "2501.12053", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12053v1", "url_pdf": "https://arxiv.org/pdf/2501.12053v1.pdf", "title": "PINNsAgent: Automated PDE Surrogation with Large Language Models", "abstract": "Solving partial differential equations (PDEs) using neural methods has been a long-standing scientific and engineering research pursuit. Physics-Informed Neural Networks (PINNs) have emerged as a promising alternative to traditional numerical methods for solving PDEs. However, the gap between domain-specific knowledge and deep learning expertise often limits the practical application of PINNs. Previous works typically involve manually conducting extensive PINNs experiments and summarizing heuristic rules for hyperparameter tuning. In this work, we introduce PINNsAgent, a novel surrogation framework that leverages large language models (LLMs) and utilizes PINNs as a foundation to bridge the gap between domain-specific knowledge and deep learning. 
Specifically, PINNsAgent integrates (1) Physics-Guided Knowledge Replay (PGKR), which encodes the essential characteristics of PDEs and their associated best-performing PINNs configurations into a structured format, enabling efficient knowledge transfer from solved PDEs to similar problems and (2) Memory Tree Reasoning, a strategy that effectively explores the search space for optimal PINNs architectures. By leveraging LLMs and exploration strategies, PINNsAgent enhances the automation and efficiency of PINNs-based solutions. We evaluate PINNsAgent on 14 benchmark PDEs, demonstrating its effectiveness in automating the surrogation process and significantly improving the accuracy of PINNs-based solutions.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "open-finllm-leaderboard-towards-financial-ai", "arxiv_id": "2501.10963", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10963v1", "url_pdf": "https://arxiv.org/pdf/2501.10963v1.pdf", "title": "Open FinLLM Leaderboard: Towards Financial AI Readiness", "abstract": "Financial large language models (FinLLMs) with multimodal capabilities are envisioned to revolutionize applications across business, finance, accounting, and auditing. However, real-world adoption requires robust benchmarks of FinLLMs' and agents' performance. Maintaining an open leaderboard of models is crucial for encouraging innovative adoption and improving model effectiveness. In collaboration with Linux Foundation and Hugging Face, we create an open FinLLM leaderboard, which serves as an open platform for assessing and comparing LLMs' performance on a wide spectrum of financial tasks. By demoncratizing access to advanced AI tools and financial knowledge, a chatbot or agent may enhance the analytical capabilities of the general public to a professional-level within a few months of usage. This open leaderboard welcomes contributions from academia, open-source community, industry, and stakeholders. In particular, we encourage contributions of new datasets, tasks, and models for continual update. Through fostering a collaborative and open ecosystem, we seek to ensure the long-term sustainability and relevance of LLMs and agents as they evolve with the financial sector's needs.", "authors": [], "published": "2025-01-19", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "generative-ai-misuse-potential-in-cyber", "arxiv_id": "2501.12883", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12883v3", "url_pdf": "https://arxiv.org/pdf/2501.12883v3.pdf", "title": "Generative AI Misuse Potential in Cyber Security Education: A Case Study of a UK Degree Program", "abstract": "Recent advances in generative artificial intelligence (AI), such as ChatGPT, Google Gemini, and other large language models (LLMs), pose significant challenges to upholding academic integrity in higher education. This paper investigates the susceptibility of a Master's-level cyber security degree program at a UK Russell Group university, accredited by a leading national body, to LLM misuse. Through the application and extension of a quantitative assessment framework, we identify a high exposure to misuse, particularly in independent project- and report-based assessments. Contributing factors, including block teaching and a predominantly international cohort, are highlighted as potential amplifiers of these vulnerabilities. 
To address these challenges, we discuss the adoption of LLM-resistant assessments, detection tools, and the importance of fostering an ethical learning environment. These approaches aim to uphold academic standards while preparing students for the complexities of real-world cyber security.", "authors": [], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "50-shades-of-deceptive-patterns-a-unified", "arxiv_id": "2501.13351", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13351v3", "url_pdf": "https://arxiv.org/pdf/2501.13351v3.pdf", "title": "50 Shades of Deceptive Patterns: A Unified Taxonomy, Multimodal Detection, and Security Implications", "abstract": "Deceptive patterns (DPs) are user interface designs deliberately crafted to manipulate users into unintended decisions, often by exploiting cognitive biases for the benefit of companies or services. While numerous studies have explored ways to identify these deceptive patterns, many existing solutions require significant human intervention and struggle to keep pace with the evolving nature of deceptive designs. To address these challenges, we expanded the deceptive pattern taxonomy from security and privacy perspectives, refining its categories and scope. We created a comprehensive dataset of deceptive patterns by integrating existing small-scale datasets with new samples, resulting in 6,725 images and 10,421 DP instances from mobile apps and websites. We then developed DPGuard, a novel automatic tool leveraging commercial multimodal large language models (MLLMs) for deceptive pattern detection. Experimental results show that DPGuard outperforms state-of-the-art methods. Finally, we conducted an extensive empirical evaluation on 2,000 popular mobile apps and websites, revealing that 23.61% of mobile screenshots and 47.27% of website screenshots feature at least one deceptive pattern instance. Through four unexplored case studies that inform security implications, we highlight the critical importance of the unified taxonomy in addressing the growing challenges of Internet deception.", "authors": [], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "bypassing-array-canaries-via-autonomous", "arxiv_id": "2501.13256", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13256v1", "url_pdf": "https://arxiv.org/pdf/2501.13256v1.pdf", "title": "Bypassing Array Canaries via Autonomous Function Call Resolution", "abstract": "We observed the Array Canary, a novel JavaScript anti-analysis technique currently exploited in-the-wild by the Phishing-as-a-Service framework Darcula. The Array Canary appears to be an advanced form of the array shuffling techniques employed by the Emotet JavaScript downloader. In practice, a series of Array Canaries are set within a string array and if modified will cause the program to endlessly loop. In this paper, we demonstrate how an Array Canary works and discuss Autonomous Function Call Resolution (AFCR), which is a method we created to bypass Array Canaries. We also introduce Arphsy, a proof-of-concept for AFCR designed to guide Large Language Models and security researchers in the deobfuscation of \"canaried\" JavaScript code. 
We accomplish this by (i) Finding and extracting all Immediately Invoked Function Expressions from a canaried file, (ii) parsing the file's Abstract Syntax Tree for any function that does not implement imported function calls, (iii) identifying the most reassigned variable and its corresponding function body, (iv) calculating the length of the largest string array and uses it to determine the offset values within the canaried file, (v) aggregating all the previously identified functions into a single file, and (vi) appending driver code into the verified file and using it to deobfuscate the canaried file.", "authors": [], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "you-can-t-eat-your-cake-and-have-it-too-the", "arxiv_id": "2501.12210", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12210v1", "url_pdf": "https://arxiv.org/pdf/2501.12210v1.pdf", "title": "You Can't Eat Your Cake and Have It Too: The Performance Degradation of LLMs with Jailbreak Defense", "abstract": "With the rise of generative large language models (LLMs) like LLaMA and ChatGPT, these models have significantly transformed daily life and work by providing advanced insights. However, as jailbreak attacks continue to circumvent built-in safety mechanisms, exploiting carefully crafted scenarios or tokens, the safety risks of LLMs have come into focus. While numerous defense strategies--such as prompt detection, modification, and model fine-tuning--have been proposed to counter these attacks, a critical question arises: do these defenses compromise the utility and usability of LLMs for legitimate users? Existing research predominantly focuses on the effectiveness of defense strategies without thoroughly examining their impact on performance, leaving a gap in understanding the trade-offs between LLM safety and performance. Our research addresses this gap by conducting a comprehensive study on the utility degradation, safety elevation, and exaggerated-safety escalation of LLMs with jailbreak defense strategies. We propose USEBench, a novel benchmark designed to evaluate these aspects, along with USEIndex, a comprehensive metric for assessing overall model performance. Through experiments on seven state-of-the-art LLMs, we found that mainstream jailbreak defenses fail to ensure both safety and performance simultaneously. Although model-finetuning performs the best overall, their effectiveness varies across LLMs. Furthermore, vertical comparisons reveal that developers commonly prioritize performance over safety when iterating or fine-tuning their LLMs.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "dagger-behind-smile-fool-llms-with-a-happy", "arxiv_id": "2501.13115", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13115v1", "url_pdf": "https://arxiv.org/pdf/2501.13115v1.pdf", "title": "Dagger Behind Smile: Fool LLMs with a Happy Ending Story", "abstract": "The wide adoption of Large Language Models (LLMs) has attracted significant attention from \\textit{jailbreak} attacks, where adversarial prompts crafted through optimization or manual design exploit LLMs to generate malicious content. However, optimization-based attacks have limited efficiency and transferability, while manual designs are either easily detectable or demand intricate interactions with LLMs. 
In this paper, we first point out a novel perspective for jailbreak attacks: LLMs are more responsive to \\textit{positive} prompts. Based on this, we deploy Happy Ending Attack (HEA) to wrap up a malicious request in a scenario template involving a positive prompt formed mainly via a \\textit{happy ending}, it thus fools LLMs into jailbreaking either immediately or at a follow-up malicious request. This has made HEA both efficient and effective, as it requires only up to two steps to fully jailbreak LLMs. Extensive experiments show that our HEA can successfully jailbreak on state-of-the-art LLMs, including GPT-4o, Llama3-70b, Gemini-pro, and achieves 88.79\\% Attack Success Rate on average. We also provide potential quantitative explanations for the success of HEA.", "authors": [ "Jun Luo", "Jiayi Kong", "Shuo Huai", "Zhixin Xie", "Xurui Song" ], "published": "2025-01-19", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "enhancing-llms-for-governance-with-human", "arxiv_id": "2501.13802", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13802v1", "url_pdf": "https://arxiv.org/pdf/2501.13802v1.pdf", "title": "Enhancing LLMs for Governance with Human Oversight: Evaluating and Aligning LLMs on Expert Classification of Climate Misinformation for Detecting False or Misleading Claims about Climate Change", "abstract": "Climate misinformation is a problem that has the potential to be substantially aggravated by the development of Large Language Models (LLMs). In this study we evaluate the potential for LLMs to be part of the solution for mitigating online dis/misinformation rather than the problem. Employing a public expert annotated dataset and a curated sample of social media content we evaluate the performance of proprietary vs. open source LLMs on climate misinformation classification task, comparing them to existing climate-focused computer-assisted tools and expert assessments. Results show (1) state-of-the-art (SOTA) open-source models substantially under-perform in classifying climate misinformation compared to proprietary models, (2) existing climate-focused computer-assisted tools leveraging expert-annotated datasets continues to outperform many of proprietary models, including GPT-4o, and (3) demonstrate the efficacy and generalizability of fine-tuning GPT-3.5-turbo on expert annotated dataset in classifying claims about climate change at the equivalency of climate change experts with over 20 years of experience in climate communication. 
These findings highlight 1) the importance of incorporating human-oversight, such as incorporating expert-annotated datasets in training LLMs, for governance tasks that require subject-matter expertise like classifying climate misinformation, and 2) the potential for LLMs in facilitating civil society organizations to engage in various governance tasks such as classifying false or misleading claims in domains beyond climate change such as politics and health science.", "authors": [], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "scopes-of-alignment", "arxiv_id": "2501.12405", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12405v1", "url_pdf": "https://arxiv.org/pdf/2501.12405v1.pdf", "title": "Scopes of Alignment", "abstract": "Much of the research focus on AI alignment seeks to align large language models and other foundation models to the context-less and generic values of helpfulness, harmlessness, and honesty. Frontier model providers also strive to align their models with these values. In this paper, we motivate why we need to move beyond such a limited conception and propose three dimensions for doing so. The first scope of alignment is competence: knowledge, skills, or behaviors the model must possess to be useful for its intended purpose. The second scope of alignment is transience: either semantic or episodic depending on the context of use. The third scope of alignment is audience: either mass, public, small-group, or dyadic. At the end of the paper, we use the proposed framework to position some technologies and workflows that go beyond prevailing notions of alignment.", "authors": [ "Justin D. Weisz", "Matthew Riemer", "Djallel Bouneffouf", "Zahra Ashktorab", "Kush R. Varshney" ], "published": "2025-01-15", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "the-elevate-ai-llms-framework-an-evaluation", "arxiv_id": "2501.12394", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12394v1", "url_pdf": "https://arxiv.org/pdf/2501.12394v1.pdf", "title": "The ELEVATE-AI LLMs Framework: An Evaluation Framework for Use of Large Language Models in HEOR: an ISPOR Working Group Report", "abstract": "Introduction. Generative Artificial Intelligence, particularly large language models (LLMs), offers transformative potential for Health Economics and Outcomes Research (HEOR). However, evaluating the quality, transparency, and rigor of LLM-assisted research lacks standardized guidance. This article introduces the ELEVATE AI LLMs framework and checklist, designed to support researchers and reviewers in assessing LLM use in HEOR. Methods. The ELEVATE AI LLMs framework was developed through a targeted review of existing guidelines and evaluation frameworks. The framework comprises ten evaluation domains, including model characteristics, accuracy, comprehensiveness, and fairness. The accompanying checklist operationalizes the framework. To validate the framework, we applied it to two published studies, demonstrating its usability across different HEOR tasks. Results. The ELEVATE AI LLMs framework provides a comprehensive structure for evaluating LLM-assisted research, while the checklist facilitates practical application. Validation of the framework and checklist on studies of systematic literature reviews and health economic modeling highlighted their ability to identify strengths and gaps in reporting. Limitations. 
While the ELEVATE AI LLMs framework provides robust guidance, its broader generalizability and applicability to diverse HEOR tasks require further empirical testing. Additionally, several metrics adapted from computer science need further validation in HEOR contexts. Conclusion. The ELEVATE AI LLMs framework and checklist fill a critical gap in HEOR by offering structured guidance for evaluating LLM-assisted research. By promoting transparency, accuracy, and reproducibility, they aim to standardize and improve the integration of LLMs into HEOR, ensuring their outputs meet the field's rigorous standards.", "authors": [ "Turgay Ayer", "Jagpreet Chhatwal", "Hua Xu", "Xiaoyan Wang", "Mitchell K. Higashi", "Jiang Bian", "Dalia Dawoud", "Rachael L. Fleurence" ], "published": "2024-12-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "transparency-security-and-workplace-training", "arxiv_id": "2501.10389", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10389v1", "url_pdf": "https://arxiv.org/pdf/2501.10389v1.pdf", "title": "Transparency, Security, and Workplace Training & Awareness in the Age of Generative AI", "abstract": "This paper investigates the impacts of the rapidly evolving landscape of generative Artificial Intelligence (AI) development. Emphasis is given to how organizations grapple with a critical imperative: reevaluating their policies regarding AI usage in the workplace. As AI technologies advance, ethical considerations, transparency, data privacy, and their impact on human labor intersect with the drive for innovation and efficiency. Our research explores publicly accessible large language models (LLMs) that often operate on the periphery, away from mainstream scrutiny. These lesser-known models have received limited scholarly analysis and may lack comprehensive restrictions and safeguards. Specifically, we examine Gab AI, a platform that centers around unrestricted communication and privacy, allowing users to interact freely without censorship. Generative AI chatbots are increasingly prevalent, but cybersecurity risks have also escalated. Organizations must carefully navigate this evolving landscape by implementing transparent AI usage policies. Frequent training and policy updates are essential to adapt to emerging threats. Insider threats, whether malicious or unwitting, continue to pose one of the most significant cybersecurity challenges in the workplace. Our research is on the lesser-known publicly accessible LLMs and their implications for workplace policies. We contribute to the ongoing discourse on AI ethics, transparency, and security by emphasizing the need for well-thought-out guidelines and vigilance in policy maintenance.", "authors": [], "published": "2024-12-19", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "beyond-the-sum-unlocking-ai-agents-potential", "arxiv_id": "2501.10388", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10388v2", "url_pdf": "https://arxiv.org/pdf/2501.10388v2.pdf", "title": "Beyond the Sum: Unlocking AI Agents Potential Through Market Forces", "abstract": "The emergence of Large Language Models has fundamentally transformed the capabilities of AI agents, enabling a new class of autonomous agents capable of interacting with their environment through dynamic code generation and execution. 
These agents possess the theoretical capacity to operate as independent economic actors within digital markets, offering unprecedented potential for value creation through their distinct advantages in operational continuity, perfect replication, and distributed learning capabilities. However, contemporary digital infrastructure, architected primarily for human interaction, presents significant barriers to their participation. This work presents a systematic analysis of the infrastructure requirements necessary for AI agents to function as autonomous participants in digital markets. We examine four key areas - identity and authorization, service discovery, interfaces, and payment systems - to show how existing infrastructure actively impedes agent participation. We argue that addressing these infrastructure challenges represents more than a technical imperative; it constitutes a fundamental step toward enabling new forms of economic organization. Much as traditional markets enable human intelligence to coordinate complex activities beyond individual capability, markets incorporating AI agents could dramatically enhance economic efficiency through continuous operation, perfect information sharing, and rapid adaptation to changing conditions. The infrastructure challenges identified in this work represent key barriers to realizing this potential.", "authors": [ "Pol Alvarez Vecino", "Jordi Montes Sanabria" ], "published": "2024-12-19", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "autonomous-microscopy-experiments-through", "arxiv_id": "2501.10385", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10385v1", "url_pdf": "https://arxiv.org/pdf/2501.10385v1.pdf", "title": "Autonomous Microscopy Experiments through Large Language Model Agents", "abstract": "The emergence of large language models (LLMs) has accelerated the development of self-driving laboratories (SDLs) for materials research. Despite their transformative potential, current SDL implementations rely on rigid, predefined protocols that limit their adaptability to dynamic experimental scenarios across different labs. A significant challenge persists in measuring how effectively AI agents can replicate the adaptive decision-making and experimental intuition of expert scientists. Here, we introduce AILA (Artificially Intelligent Lab Assistant), a framework that automates atomic force microscopy (AFM) through LLM-driven agents. Using AFM as an experimental testbed, we develop AFMBench, a comprehensive evaluation suite that challenges AI agents based on language models like GPT-4o and GPT-3.5 to perform tasks spanning the scientific workflow: from experimental design to results analysis. Our systematic assessment shows that state-of-the-art language models struggle even with basic tasks such as documentation retrieval, leading to a significant decline in performance in multi-agent coordination scenarios. Further, we observe that LLMs exhibit a tendency to not adhere to instructions or even digress to additional tasks beyond the original request, raising serious concerns regarding safety alignment aspects of AI agents for SDLs. Finally, we demonstrate the application of AILA on increasingly complex, open-ended experiments: automated AFM calibration, high-resolution feature detection, and mechanical property measurement. 
Our findings emphasize the necessity for stringent benchmarking protocols before deploying AI agents as laboratory assistants across scientific disciplines.", "authors": [ "N. M. Anoop Krishnan", "Nitya Nand Gosvami", "Lothar Wondraczek", "Katrin Wondraczek", "Morten M. Smedskjaer", "Mohd Zaki", "Jitendra Soni", "Indrajeet Mandal" ], "published": "2024-12-18", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "an-empirical-characterization-of-outages-and", "arxiv_id": "2501.12469", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12469v1", "url_pdf": "https://arxiv.org/pdf/2501.12469v1.pdf", "title": "An Empirical Characterization of Outages and Incidents in Public Services for Large Language Models", "abstract": "People and businesses increasingly rely on public LLM services, such as ChatGPT, DALLE, and Claude. Understanding their outages, and particularly measuring their failure-recovery processes, is becoming a pressing problem. However, only limited studies exist in this emerging area. Addressing this problem, in this work we conduct an empirical characterization of outages and failure-recovery in public LLM services. We collect and prepare datasets for 8 commonly used LLM services across 3 major LLM providers, including market leaders OpenAI and Anthropic. We conduct a detailed analysis of failure-recovery statistical properties, temporal patterns, co-occurrence, and the impact range of outage-causing incidents. We make over 10 observations, among which: (1) failures in OpenAI's ChatGPT take longer to resolve but occur less frequently than those in Anthropic's Claude; (2) OpenAI and Anthropic service failures exhibit strong weekly and monthly periodicity; and (3) OpenAI services offer better failure isolation than Anthropic services. Our research explains LLM failure characteristics and thus enables optimization in building and using LLM systems. FAIR data and code are publicly available on https://zenodo.org/records/14018219 and https://github.com/atlarge-research/llm-service-analysis.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "leveraging-large-language-models-for-23", "arxiv_id": "2501.12221", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12221v1", "url_pdf": "https://arxiv.org/pdf/2501.12221v1.pdf", "title": "Leveraging Large Language Models for Realizing Truly Intelligent User Interfaces", "abstract": "The number of published scholarly articles is growing at a significant rate, making scholarly knowledge organization increasingly important. Various approaches have been proposed to organize scholarly information, including describing scholarly knowledge semantically by leveraging knowledge graphs. Transforming unstructured knowledge, presented within articles, to structured and semantically represented knowledge generally requires human intelligence and labor since natural language processing methods alone typically do not render sufficient precision and recall for many applications. With the recent developments of Large Language Models (LLMs), it becomes increasingly possible to provide truly intelligent user interfaces guiding humans in the transformation process. We present an approach to integrate non-intrusive LLM guidance into existing user interfaces. More specifically, we integrate LLM-supported user interface components into an existing scholarly knowledge infrastructure. 
Additionally, we report on our experiences with LLM integration, detailing best practices and obstacles. Finally, we evaluate the approach through a small-scale user study with domain experts.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "exploring-the-potential-of-large-language-11", "arxiv_id": "2501.10630", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10630v1", "url_pdf": "https://arxiv.org/pdf/2501.10630v1.pdf", "title": "Exploring the Potential of Large Language Models for Massive MIMO CSI Feedback", "abstract": "Large language models (LLMs) have achieved remarkable success across a wide range of tasks, particularly in natural language processing and computer vision. This success naturally raises an intriguing yet unexplored question: Can LLMs be harnessed to tackle channel state information (CSI) compression and feedback in massive multiple-input multiple-output (MIMO) systems? Efficient CSI feedback is a critical challenge in next-generation wireless communication. In this paper, we pioneer the use of LLMs for CSI compression, introducing a novel framework that leverages the powerful denoising capabilities of LLMs -- capable of error correction in language tasks -- to enhance CSI reconstruction performance. To effectively adapt LLMs to CSI data, we design customized pre-processing, embedding, and post-processing modules tailored to the unique characteristics of wireless signals. Extensive numerical results demonstrate the promising potential of LLMs in CSI feedback, opening up possibilities for this research direction.", "authors": [], "published": "2025-01-18", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "paradigm-based-automatic-hdl-code-generation", "arxiv_id": "2501.12702", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12702v1", "url_pdf": "https://arxiv.org/pdf/2501.12702v1.pdf", "title": "Paradigm-Based Automatic HDL Code Generation Using LLMs", "abstract": "While large language models (LLMs) have demonstrated the ability to generate hardware description language (HDL) code for digital circuits, they still face the hallucination problem, which can result in the generation of incorrect HDL code or misinterpretation of specifications. In this work, we introduce a human-expert-inspired method to mitigate the hallucination of LLMs and enhance their performance in HDL code generation. We begin by constructing specialized paradigm blocks that consist of several steps designed to divide and conquer generation tasks, mirroring the design methodology of human experts. These steps include information extraction, human-like design flows, and the integration of external tools. LLMs are then instructed to classify the type of circuit in order to match it with the appropriate paradigm block and execute the block to generate the HDL code. Additionally, we propose a two-phase workflow for multi-round generation, aimed at effectively improving the testbench pass rate of the generated HDL code within a limited number of generation and verification rounds. 
Experimental results demonstrate that our method significantly enhances the functional correctness of the generated Verilog code.", "authors": [], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "osum-advancing-open-speech-understanding", "arxiv_id": "2501.13306", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13306v1", "url_pdf": "https://arxiv.org/pdf/2501.13306v1.pdf", "title": "OSUM: Advancing Open Speech Understanding Models with Limited Resources in Academia", "abstract": "Large Language Models (LLMs) have made significant progress in various downstream tasks, inspiring the development of Speech Understanding Language Models (SULMs) to enable comprehensive speech-based interactions. However, most advanced SULMs are developed by industry, leveraging large-scale datasets and computational resources that are not readily available to the academic community. Moreover, the lack of transparency in training details creates additional barriers to further innovation. In this study, we present OSUM, an Open Speech Understanding Model designed to explore the potential of training SULMs under constrained academic resources. The OSUM model combines a Whisper encoder with a Qwen2 LLM and supports a wide range of speech tasks, including speech recognition (ASR), speech recognition with timestamps (SRWT), vocal event detection (VED), speech emotion recognition (SER), speaking style recognition (SSR), speaker gender classification (SGC), speaker age prediction (SAP), and speech-to-text chat (STTC). By employing an ASR+X training strategy, OSUM achieves efficient and stable multi-task training by simultaneously optimizing ASR alongside target tasks. Beyond delivering strong performance, OSUM emphasizes transparency by providing openly available data preparation and training methodologies, offering valuable insights and practical guidance for the academic community. By doing so, we aim to accelerate research and innovation in advanced SULM technologies.", "authors": [], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-domain-adaptation-framework-for-speech", "arxiv_id": "2501.12501", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12501v1", "url_pdf": "https://arxiv.org/pdf/2501.12501v1.pdf", "title": "A Domain Adaptation Framework for Speech Recognition Systems with Only Synthetic data", "abstract": "We introduce DAS (Domain Adaptation with Synthetic data), a novel domain adaptation framework for pre-trained ASR models, designed to efficiently adapt to various language-defined domains without requiring any real data. In particular, DAS first prompts large language models (LLMs) to generate domain-specific texts before converting these texts to speech via text-to-speech technology. The synthetic data is used to fine-tune Whisper with Low-Rank Adapters (LoRAs) for targeted domains such as music, weather, and sports. We introduce a novel one-pass decoding strategy that merges predictions from multiple LoRA adapters efficiently during the auto-regressive text generation process. Experimental results show significant improvements, reducing the Word Error Rate (WER) by 10% to 17% across all target domains compared to the original model, with minimal performance regression in out-of-domain settings (e.g., -1% on Librispeech test sets). 
We also demonstrate that DAS operates efficiently during inference, introducing only a 9% increase in Real Time Factor (RTF) compared to the original model when inferring with three LoRA adapters.", "authors": [ "Xin Lei", "Xuedong Zhang", "Shun Zhang", "Ke Li", "Jinxi Guo", "Kevin Jiang", "Laxmi Pandey", "Debjyoti Paul", "Yutong Pang", "Minh Tran" ], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "rate-aware-learned-speech-compression", "arxiv_id": "2501.11999", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.11999v1", "url_pdf": "https://arxiv.org/pdf/2501.11999v1.pdf", "title": "Rate-Aware Learned Speech Compression", "abstract": "The rapid rise of real-time communication and large language models has significantly increased the importance of speech compression. Deep learning-based neural speech codecs have outperformed traditional signal-level speech codecs in terms of rate-distortion (RD) performance. Typically, these neural codecs employ an encoder-quantizer-decoder architecture, where audio is first converted into latent code feature representations and then into discrete tokens. However, this architecture exhibits insufficient RD performance due to two main drawbacks: (1) the inadequate performance of the quantizer, challenging training processes, and issues such as codebook collapse; (2) the limited representational capacity of the encoder and decoder, making it difficult to meet feature representation requirements across various bitrates. In this paper, we propose a rate-aware learned speech compression scheme that replaces the quantizer with an advanced channel-wise entropy model to improve RD performance, simplify training, and avoid codebook collapse. We employ multi-scale convolution and linear attention mixture blocks to enhance the representational capacity and flexibility of the encoder and decoder. Experimental results demonstrate that the proposed method achieves state-of-the-art RD performance, obtaining a 53.51% BD-Rate bitrate saving on average and achieving gains of 0.26 BD-VisQol and 0.44 BD-PESQ.", "authors": [ "Li Song", "Yuelin Hu", "YuHan Liu", "Guangchuan Chi", "Zhengxue Cheng", "Jun Xu" ], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "empowering-aiops-leveraging-large-language", "arxiv_id": "2501.12461", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12461v2", "url_pdf": "https://arxiv.org/pdf/2501.12461v2.pdf", "title": "Empowering AIOps: Leveraging Large Language Models for IT Operations Management", "abstract": "The integration of Artificial Intelligence (AI) into IT Operations Management (ITOM), commonly referred to as AIOps, offers substantial potential for automating workflows, enhancing efficiency, and supporting informed decision-making. However, implementing AI within IT operations is not without its challenges, including issues related to data quality, the complexity of IT environments, and skill gaps within teams. The advent of Large Language Models (LLMs) presents an opportunity to address some of these challenges, particularly through their advanced natural language understanding capabilities. These features enable organizations to process and analyze vast amounts of unstructured data, such as system logs, incident reports, and technical documentation. 
This ability aligns with the motivation behind our research, where we aim to integrate traditional predictive machine learning models with generative AI technologies like LLMs. By combining these approaches, we propose innovative methods to tackle persistent challenges in AIOps and enhance the capabilities of IT operations management.", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "an-empirical-study-of-retrieval-augmented-1", "arxiv_id": "2501.13742", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13742v1", "url_pdf": "https://arxiv.org/pdf/2501.13742v1.pdf", "title": "An Empirical Study of Retrieval-Augmented Code Generation: Challenges and Opportunities", "abstract": "Code generation aims to automatically generate code snippets of a specific programming language according to natural language descriptions. The continuous advancements in deep learning, particularly pre-trained models, have empowered the code generation task to achieve remarkable performance. One main challenge of pre-trained models for code generation is the semantic gap between natural language requirements and source code. To address the issue, prior studies typically adopt a retrieval-augmented framework for the task, where similar code snippets collected by a retrieval process can be leveraged to help understand the requirements and provide guidance for the generation process. However, there is a lack of systematic study on the application of this framework for code generation, including its impact on the final generated results and the specific usage of the framework. In this paper, we choose three popular pre-trained code models, namely CodeGen, UniXcoder, and CodeT5, to assess the impact of the quality and utilization of retrieved code on the retrieval-augmented framework. Our analysis shows that the retrieval-augmented framework is beneficial for improving the performance of existing pre-trained models. We also provide suggestions on the utilization of the retrieval-augmented code generation framework: BM25 and Sequential Integration Fusion are recommended due to their convenience and superior performance. Sketch Filling Fusion, which extracts a sketch of relevant code, could help the model improve its performance further. Additionally, we conduct experiments to investigate the influence of the retrieval-augmented framework on large language models for code generation, showing the effectiveness of the framework, and we discuss the trade-off between performance improvement and computational costs in each phase within the framework.", "authors": [], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "do-llms-provide-links-to-code-similar-to-what", "arxiv_id": "2501.12134", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.12134v1", "url_pdf": "https://arxiv.org/pdf/2501.12134v1.pdf", "title": "Do LLMs Provide Links to Code Similar to what they Generate? A Study with Gemini and Bing CoPilot", "abstract": "Large Language Models (LLMs) are currently used for various software development tasks, including generating code snippets to solve specific problems. Unlike reuse from the Web, LLMs are limited in providing provenance information about the generated code, which may have important trustworthiness and legal consequences. 
While LLM-based assistants may provide external links that are \"related\" to the generated code, we do not know how relevant such links are. This paper presents the findings of an empirical study assessing the extent to which 243 and 194 code snippets across six programming languages, generated by Bing CoPilot and Google Gemini respectively, likely originate from the links provided by these two LLM-based assistants. The study combines automated code similarity assessments with thorough manual analysis. The study's findings indicate that the LLM-based assistants provide a mix of relevant and irrelevant links of varying nature. Specifically, although 66% of the links from Bing CoPilot and 28% from Google Gemini are relevant, LLM-based assistants still suffer from serious \"provenance debt\".", "authors": [], "published": "2025-01-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "towards-detecting-prompt-knowledge-gaps-for", "arxiv_id": "2501.11709", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.11709v2", "url_pdf": "https://arxiv.org/pdf/2501.11709v2.pdf", "title": "Towards Detecting Prompt Knowledge Gaps for Improved LLM-guided Issue Resolution", "abstract": "Large language models (LLMs) have become essential in software development, especially for issue resolution. However, despite their widespread use, significant challenges persist in the quality of LLM responses to issue resolution queries. LLM interactions often yield incorrect, incomplete, or ambiguous information, largely due to knowledge gaps in prompt design, which can lead to unproductive exchanges and reduced developer productivity. In this paper, we analyze 433 developer-ChatGPT conversations within GitHub issue threads to examine the impact of prompt knowledge gaps and conversation styles on issue resolution. We identify four main knowledge gaps in developer prompts: Missing Context, Missing Specifications, Multiple Context, and Unclear Instructions. Assuming that conversations within closed issues contributed to successful resolutions while those in open issues did not, we find that ineffective conversations contain knowledge gaps in 44.6% of prompts, compared to only 12.6% in effective ones. Additionally, we observe seven distinct conversational styles, with Directive Prompting, Chain of Thought, and Responsive Feedback being the most prevalent. We find that knowledge gaps are present in all styles of conversations, with Missing Context being the most repeated challenge developers face in issue-resolution conversations. Based on our analysis, we identify key textual and code-related heuristics (Specificity, Contextual Richness, and Clarity) that are associated with successful issue closure and help assess prompt quality. These heuristics lay the foundation for an automated tool that can dynamically flag unclear prompts and suggest structured improvements. 
To test feasibility, we developed a lightweight browser extension prototype for detecting prompt gaps that can be easily adapted to other tools within developer workflows.", "authors": [], "published": "2025-01-20", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "systems-engineering-for-autonomous-vehicles", "arxiv_id": "2501.10839", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10839v1", "url_pdf": "https://arxiv.org/pdf/2501.10839v1.pdf", "title": "Systems Engineering for Autonomous Vehicles; Supervising AI using Large Language Models (SSuperLLM)", "abstract": "Generative Artificial Intelligence (GAI) and the idea of using hierarchical models have been around for some years now. GAI has proved to be an extremely useful tool for Autonomous Vehicles (AVs). AVs need to perform robustly in their environment. Thus, AV behavior and short-term trajectory planning need to be: a) designed and architected using safeguarding and supervisory systems and b) verified using proper Systems Engineering (SysEng) principles. Can AV Systems Engineering also use Large Language Models (LLMs) to help autonomous vehicle (AV) development? This reader-friendly paper advocates the use of LLMs in 1) requirements (Reqs) development and 2) Reqs verification, and 3) provides a proof-of-concept of AV supervisory control. The latter uses a simulation environment of a simple planar (bicycle) vehicle dynamics model and a Linear Quadratic Regulator (LQR) controller with an LLM Application Programming Interface (API). The open-source simulation SW is available from the author and accessible to readers so that they can engage with the AV stack, LLM API and rules, SysEng and Reqs, and fundamental vehicle dynamics and control.", "authors": [ "Diomidis Katzourakis" ], "published": "2025-01-18", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "open-sourcing-gpts-economics-of-open-sourcing", "arxiv_id": "2501.11581", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.11581v1", "url_pdf": "https://arxiv.org/pdf/2501.11581v1.pdf", "title": "Open Sourcing GPTs: Economics of Open Sourcing Advanced AI Models", "abstract": "This paper explores the economic underpinnings of open sourcing advanced large language models (LLMs) by for-profit companies. Empirical analysis reveals that: (1) LLMs are compatible with R&D portfolios of numerous technologically differentiated firms; (2) open-sourcing likelihood decreases with an LLM's performance edge over rivals, but increases for models from large tech companies; and (3) open-sourcing an advanced LLM led to an increase in research-related activities. Motivated by these findings, a theoretical framework is developed to examine factors influencing a profit-maximizing firm's open-sourcing decision. The analysis frames this decision as a trade-off between accelerating technology growth and securing immediate financial returns. A key prediction from the theoretical analysis is an inverted-U-shaped relationship between the owner's size, measured by its share of LLM-compatible applications, and its propensity to open source the LLM. 
This finding suggests that moderate market concentration may be beneficial to the open source ecosystems of multi-purpose software technologies.", "authors": [ "Mahyar Habibi" ], "published": "2025-01-20", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "calculus-for-the-modern-engineer-putting-the", "arxiv_id": "2501.10406", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10406v1", "url_pdf": "https://arxiv.org/pdf/2501.10406v1.pdf", "title": "Calculus for the Modern Engineer: Putting the Joy Back in Learning Advanced Mathematics", "abstract": "Many engineering students enter college excited about math and physics, only to have their enthusiasm dimmed by a rigid, outdated calculus curriculum. The University of Michigan's Robotics Department is piloting a new 4-credit course, ``Calculus for the Modern Engineer,'' to reintroduce the excitement of learning advanced mathematics. Integrating Differential and Integral Calculus, vector derivatives, and Ordinary Differential Equations (ODEs) into a unified one-semester curriculum, the course emphasizes conceptual mastery and real-world applications. It starts with definite integration -- building on students' intuitive understanding of sums -- before progressing through limits, differentiation, antiderivatives, and ODEs. By leveraging computational tools like Julia, Large Language Models (LLMs), and Wolfram Alpha Pro, it reduces reliance on tedious hand calculations. Case studies in numerical integration, optimization, and feedback control connect theory to engineering challenges. Supported by an open-source textbook and programming assignments, the course equips students with modern computational skills and reignites their passion for mathematics.", "authors": [], "published": "2025-01-05", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "intelligent-exercise-and-feedback-system-for", "arxiv_id": "2501.13723", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13723v1", "url_pdf": "https://arxiv.org/pdf/2501.13723v1.pdf", "title": "Intelligent Exercise and Feedback System for Social Healthcare using LLMOps", "abstract": "This study addresses the growing demand for personalized feedback in healthcare platforms and social communities by introducing an LLMOps-based system for automated exercise analysis and personalized recommendations. Current healthcare platforms rely heavily on manual analysis and generic health advice, limiting user engagement and health promotion effectiveness. We developed a system that leverages Large Language Models (LLM) to automatically analyze user activity data from the \"Ounwan\" exercise recording community. The system integrates LLMOps with LLM APIs, containerized infrastructure, and CI/CD practices to efficiently process large-scale user activity data, identify patterns, and generate personalized recommendations. The architecture ensures scalability, reliability, and security for large-scale healthcare communities. Evaluation results demonstrate the system's effectiveness in three key metrics: exercise classification, duration prediction, and caloric expenditure estimation. 
This approach improves the efficiency of community management while providing more accurate and personalized feedback to users, addressing the limitations of traditional manual analysis methods.", "authors": [ "Hyung Soo Han", "TaeYoung Kim", "Yeongrak Choi" ], "published": "2025-01-22", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null } ] }{ "count": 24708, "next": "
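For readers consuming this listing programmatically, a minimal client sketch follows. It assumes only what the response above shows: a JSON body with "count", "next", "previous", and "results" fields, where "next" holds the URL of the following page or is null on the last page. The start_url argument is a placeholder for whichever paginated papers-listing URL is being queried, and the use of the requests library is an assumption, not part of the API itself.

    import requests

    def iter_papers(start_url):
        """Yield every paper record by following the paginated listing's "next" links."""
        url = start_url
        while url:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            page = response.json()
            # Each page carries its records under "results"; "count" is the total across all pages.
            yield from page.get("results", [])
            # "next" is null (None) on the last page, which ends the loop.
            url = page.get("next")

    # Example usage (start_url is a placeholder for the listing URL being paged through):
    # for paper in iter_papers(start_url):
    #     print(paper["published"], paper["title"])

Following the "next" link rather than incrementing a page counter keeps the client correct even if the page size or query parameters change between requests.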