Paper List
Return a paginated listing of all papers. The listing can be filtered with the q search parameter, sorted with the ordering parameter, and paged with the page parameter; each response carries next and previous page URLs alongside the results array.
GET /api/v1/papers/?ordering=title&q=Large+Language+Models
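A minimal client sketch in Python, assuming the third-party requests package (any HTTP client works the same way). It issues the request shown above and then follows the paginated next links, relying on the next/previous/results envelope visible in the sample response below; the printed fields (arxiv_id, title) are taken from that sample.

import requests

BASE_URL = "https://paperswithcode.com/api/v1"

# First page: same query as the example request above.
params = {"ordering": "title", "q": "Large Language Models"}
resp = requests.get(f"{BASE_URL}/papers/", params=params, timeout=30)
resp.raise_for_status()
page = resp.json()

for paper in page["results"]:
    print(paper["arxiv_id"], paper["title"])

# The listing is paginated: keep requesting the "next" URL until it is null.
# (For a broad query this walks every remaining page, so stop early if you
# only need a few pages.)
next_url = page.get("next")
while next_url:
    page = requests.get(next_url, timeout=30).json()
    for paper in page["results"]:
        print(paper["arxiv_id"], paper["title"])
    next_url = page.get("next")

The sample JSON that follows is one page of such a response, truncated for length.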
https://paperswithcode.com/api/v1/papers/?ordering=title&page=2&q=Large+Language+Models", "previous": null, "results": [ { "id": "100-hallucination-elimination-using-acurai", "arxiv_id": "2412.05223", "nips_id": null, "url_abs": "https://arxiv.org/abs/2412.05223v1", "url_pdf": "https://arxiv.org/pdf/2412.05223v1.pdf", "title": "100% Hallucination Elimination Using Acurai", "abstract": "The issue of hallucinations in large language models (LLMs) remains a critical barrier to the adoption of AI in enterprise and other high-stakes applications. Despite advancements in retrieval-augmented generation (RAG) systems, current state-of-the-art methods fail to achieve more than 80% accuracy in generating faithful and factually correct outputs, even when provided with relevant and accurate context. In this work, we introduce Acurai, a novel systematic approach that achieves 100% hallucination-free responses in LLMs by reformatting queries and context data prior to input. Leveraging a deep understanding of LLM internal representations, the importance of noun-phrase dominance, and the role of discrete functional units (DFUs), Acurai ensures alignment between input context and generated output. We validate this method using the RAGTruth corpus, demonstrating its ability to eliminate 100% hallucinations for both GPT-4 and GPT-3.5 Turbo. Acurai sets a new standard for achieving consistent, accurate, and faithful AI responses, marking a significant step forward in the development of trustworthy AI systems.", "authors": [ "Adam A. Forbes", "Michael C. Wood" ], "published": "2024-12-06", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "101-billion-arabic-words-dataset", "arxiv_id": "2405.01590", "nips_id": null, "url_abs": "https://arxiv.org/abs/2405.01590v1", "url_pdf": "https://arxiv.org/pdf/2405.01590v1.pdf", "title": "101 Billion Arabic Words Dataset", "abstract": "In recent years, Large Language Models have revolutionized the field of natural language processing, showcasing an impressive rise predominantly in English-centric domains. These advancements have set a global benchmark, inspiring significant efforts toward developing Arabic LLMs capable of understanding and generating the Arabic language with remarkable accuracy. Despite these advancements, a critical challenge persists: the potential bias in Arabic LLMs, primarily attributed to their reliance on datasets comprising English data that has been translated into Arabic. This reliance not only compromises the authenticity of the generated content but also reflects a broader issue -the scarcity of original quality Arabic linguistic data. This study aims to address the data scarcity in the Arab world and to encourage the development of Arabic Language Models that are true to both the linguistic and nuances of the region. We undertook a large-scale data mining project, extracting a substantial volume of text from the Common Crawl WET files, specifically targeting Arabic content. The extracted data underwent a rigorous cleaning and deduplication process, using innovative techniques to ensure the integrity and uniqueness of the dataset. The result is the 101 Billion Arabic Words Dataset, the largest Arabic dataset available to date, which can significantly contribute to the development of authentic Arabic LLMs. 
This study not only highlights the potential for creating linguistically and culturally accurate Arabic LLMs but also sets a precedent for future research in enhancing the authenticity of Arabic language models.", "authors": [ "Chehir Dhaouadi", "Haithem Kchaou", "Ghaith Chaabane", "Hasna Chouikhi", "Manel Aloui" ], "published": "2024-04-29", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "1024m-at-smm4h-2024-tasks-3-5-6-ensembles-of", "arxiv_id": "2410.15998", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.15998v1", "url_pdf": "https://arxiv.org/pdf/2410.15998v1.pdf", "title": "1024m at SMM4H 2024: Tasks 3, 5 & 6 -- Ensembles of Transformers and Large Language Models for Medical Text Classification", "abstract": "Social media is a great source of data for users reporting information and regarding their health and how various things have had an effect on them. This paper presents various approaches using Transformers and Large Language Models and their ensembles, their performance along with advantages and drawbacks for various tasks of SMM4H'24 - Classifying texts on impact of nature and outdoor spaces on the author's mental health (Task 3), Binary classification of tweets reporting their children's health disorders like Asthma, Autism, ADHD and Speech disorder (task 5), Binary classification of users self-reporting their age (task 6).", "authors": [ "M. V. P. Chandra Sekhara Rao", "Ram Mohan Rao Kadiyala" ], "published": "2024-10-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "1-1-2-can-large-language-models-serve-as", "arxiv_id": "2406.14721", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.14721v1", "url_pdf": "https://arxiv.org/pdf/2406.14721v1.pdf", "title": "1+1>2: Can Large Language Models Serve as Cross-Lingual Knowledge Aggregators?", "abstract": "Large Language Models (LLMs) have garnered significant attention due to their remarkable ability to process information across various languages. Despite their capabilities, they exhibit inconsistencies in handling identical queries in different languages, presenting challenges for further advancement. This paper introduces a method to enhance the multilingual performance of LLMs by aggregating knowledge from diverse languages. This approach incorporates a low-resource knowledge detector specific to a language, a language selection process, and mechanisms for answer replacement and integration. Our experiments demonstrate notable performance improvements, particularly in reducing language performance disparity. An ablation study confirms that each component of our method significantly contributes to these enhancements. 
This research highlights the inherent potential of LLMs to harmonize multilingual capabilities and offers valuable insights for further exploration.", "authors": [ "Lichao Sun", "Xiangliang Zhang", "Tianyi Zhou", "Siyuan Wu", "Yuan Li", "Chenrui Fan", "Yue Huang" ], "published": "2024-06-20", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "14-examples-of-how-llms-can-transform", "arxiv_id": "2306.06283", "nips_id": null, "url_abs": "https://arxiv.org/abs/2306.06283v4", "url_pdf": "https://arxiv.org/pdf/2306.06283v4.pdf", "title": "14 Examples of How LLMs Can Transform Materials Science and Chemistry: A Reflection on a Large Language Model Hackathon", "abstract": "Large-language models (LLMs) such as GPT-4 caught the interest of many scientists. Recent studies suggested that these models could be useful in chemistry and materials science. To explore these possibilities, we organized a hackathon. This article chronicles the projects built as part of this hackathon. Participants employed LLMs for various applications, including predicting properties of molecules and materials, designing novel interfaces for tools, extracting knowledge from unstructured data, and developing new educational applications. The diverse topics and the fact that working prototypes could be generated in less than two days highlight that LLMs will profoundly impact the future of our fields. The rich collection of ideas and projects also indicates that the applications of LLMs are not limited to materials science and chemistry but offer potential benefits to a wide range of scientific disciplines.", "authors": [ "Joren Van Herck", "Nicolas Gastellu", "Matthew L. Evans", "Wibe A. de Jong", "Sam Cox", "Defne Circi", "Kamal Choudhary", "L. Catherine Brinson", "Stefan Bringuier", "Andres M Bran", "Joshua D. Bocarsly", "Ben Blaiszik", "Andrew D. White", "Ian Foster", "KJ Schmidt", "Aristana Scourtas", "Ghezal Ahmad Zia", "Xiaoqi Zhang", "Sylvester Zhang", "Benjamin Weiser", "Sean Warren", "Logan Ward", "Christoph Völker", "Ben E. Smith", "Berend Smit", "Jiale Shi", "Marcus Schwarting", "Philippe Schwaller", "Jacob N. Sanders", "Samuel G. Rodriques", "Bojana Ranković", "Mayk Caldas Ramos", "Michael Pieler", "Brenden Pelkie", "Beatriz Mouriño", "Elias Moubarak", "Nicolas Moitessier", "Garrett W. Merz", "Sauradeep Majumdar", "Steven Ma", "Tao Liu", "Jakub Lála", "Anne Labarre", "Sabine Kruschwitz", "Alishba Imran", "Zhi Hong", "Ankur K. Gupta", "María Victoria Gil", "Jerome Genzling", "Shruti Badhwar", "Alexander Al-Feghali", "Qianxiang Ai", "Kevin Maik Jablonka" ], "published": "2023-06-09", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "1-800-shared-tasks-nlu-of-devanagari-script", "arxiv_id": "2411.06850", "nips_id": null, "url_abs": "https://arxiv.org/abs/2411.06850v1", "url_pdf": "https://arxiv.org/pdf/2411.06850v1.pdf", "title": "1-800-SHARED-TASKS @ NLU of Devanagari Script Languages: Detection of Language, Hate Speech, and Targets using LLMs", "abstract": "This paper presents a detailed system description of our entry for the CHiPSAL 2025 shared task, focusing on language detection, hate speech identification, and target detection in Devanagari script languages. 
We experimented with a combination of large language models and their ensembles, including MuRIL, IndicBERT, and Gemma-2, and leveraged unique techniques like focal loss to address challenges in the natural understanding of Devanagari languages, such as multilingual processing and class imbalance. Our approach achieved competitive results across all tasks: F1 of 0.9980, 0.7652, and 0.6804 for Sub-tasks A, B, and C respectively. This work provides insights into the effectiveness of transformer models in tasks with domain-specific and linguistic challenges, as well as areas for potential improvement in future iterations.", "authors": [ "Ram Mohan Rao Kadiyala", "Ashay Srivastava", "Drishti Sharma", "Muhammad Arham", "Kanwal Mehreen", "Siddartha Pullakhandam", "Jebish Purbey" ], "published": "2024-11-11", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "1-bit-ai-infra-part-1-1-fast-and-lossless", "arxiv_id": "2410.16144", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.16144v2", "url_pdf": "https://arxiv.org/pdf/2410.16144v2.pdf", "title": "1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs", "abstract": "Recent advances in 1-bit Large Language Models (LLMs), such as BitNet and BitNet b1.58, present a promising approach to enhancing the efficiency of LLMs in terms of speed and energy consumption. These developments also enable local LLM deployment across a broad range of devices. In this work, we introduce bitnet.cpp, a tailored software stack designed to unlock the full potential of 1-bit LLMs. Specifically, we develop a set of kernels to support fast and lossless inference of ternary BitNet b1.58 LLMs on CPUs. Extensive experiments demonstrate that bitnet.cpp achieves significant speedups, ranging from 2.37x to 6.17x on x86 CPUs and from 1.37x to 5.07x on ARM CPUs, across various model sizes. The code is available at https://github.com/microsoft/BitNet.", "authors": [ "Furu Wei", "Yan Xia", "Hongyu Wang", "Shuming Ma", "Shaoguang Mao", "Ting Song", "Hansong Zhou", "Jinheng Wang" ], "published": "2024-10-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "1-trillion-token-1tt-platform-a-novel", "arxiv_id": "2409.20149", "nips_id": null, "url_abs": "https://arxiv.org/abs/2409.20149v1", "url_pdf": "https://arxiv.org/pdf/2409.20149v1.pdf", "title": "1 Trillion Token (1TT) Platform: A Novel Framework for Efficient Data Sharing and Compensation in Large Language Models", "abstract": "In this paper, we propose the 1 Trillion Token Platform (1TT Platform), a novel framework designed to facilitate efficient data sharing with a transparent and equitable profit-sharing mechanism. The platform fosters collaboration between data contributors, who provide otherwise non-disclosed datasets, and a data consumer, who utilizes these datasets to enhance their own services. Data contributors are compensated in monetary terms, receiving a share of the revenue generated by the services of the data consumer. The data consumer is committed to sharing a portion of the revenue with contributors, according to predefined profit-sharing arrangements. 
By incorporating a transparent profit-sharing paradigm to incentivize large-scale data sharing, the 1TT Platform creates a collaborative environment to drive the advancement of NLP and LLM technologies.", "authors": [ "Seonghoon Yang", "Sukyung Lee", "Dahyun Kim", "Yungi Kim", "Jihoo Kim", "Hyunsoo Ha", "Chanjun Park" ], "published": "2024-09-30", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "2d-dpo-scaling-direct-preference-optimization", "arxiv_id": "2410.19720", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.19720v1", "url_pdf": "https://arxiv.org/pdf/2410.19720v1.pdf", "title": "2D-DPO: Scaling Direct Preference Optimization with 2-Dimensional Supervision", "abstract": "Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference of DPO to two dimensions: segments and aspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For the segment dimension, we divide the response into sentences and assign scores to each segment. For the aspect dimension, we meticulously design several criteria covering the response quality rubrics. With the 2-dimensional signals as feedback, we develop a 2D-DPO framework, decomposing the overall objective into multi-segment and multi-aspect objectives. Extensive experiments on popular benchmarks demonstrate that 2D-DPO performs better than methods that optimize for scalar or 1-dimensional preferences.", "authors": [ "Bo Zheng", "Wenbo Su", "Jihao Gu", "Weixun Wang", "Hangyu Guo", "Jiaheng Liu", "Xingyuan Bu", "Hui Huang", "Yancheng He", "Shilong Li" ], "published": "2024-10-25", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "2d-tpe-two-dimensional-positional-encoding", "arxiv_id": "2409.19700", "nips_id": null, "url_abs": "https://arxiv.org/abs/2409.19700v3", "url_pdf": "https://arxiv.org/pdf/2409.19700v3.pdf", "title": "2D-TPE: Two-Dimensional Positional Encoding Enhances Table Understanding for Large Language Models", "abstract": "Tables are ubiquitous across various domains for concisely representing structured information. Empowering large language models (LLMs) to reason over tabular data represents an actively explored direction. However, since typical LLMs only support one-dimensional~(1D) inputs, existing methods often flatten the two-dimensional~(2D) table structure into a sequence of tokens, which can severely disrupt the spatial relationships and result in an inevitable loss of vital contextual information. In this paper, we first empirically demonstrate the detrimental impact of such flattening operations on the performance of LLMs in capturing the spatial information of tables through two elaborate proxy tasks. Subsequently, we introduce a simple yet effective positional encoding method, termed ``2D-TPE'' (Two-Dimensional Table Positional Encoding), to address this challenge. 2D-TPE enables each attention head to dynamically select a permutation order of tokens within the context for attending to them, where each permutation represents a distinct traversal mode for the table, such as column-wise or row-wise traversal. 
2D-TPE effectively mitigates the risk of losing essential spatial information while preserving computational efficiency, thus better preserving the table structure. Extensive experiments across five benchmarks demonstrate that 2D-TPE outperforms strong baselines, underscoring the importance of preserving the table structure for accurate table comprehension. Comprehensive analysis further reveals the substantially better scalability of 2D-TPE to large tables than baselines.", "authors": [ "Rui Yan", "Zhengtao Yu", "Wei Wu", "Jian Guan", "Jia-Nan Li" ], "published": "2024-09-29", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "2ssp-a-two-stage-framework-for-structured", "arxiv_id": "2501.17771", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.17771v1", "url_pdf": "https://arxiv.org/pdf/2501.17771v1.pdf", "title": "2SSP: A Two-Stage Framework for Structured Pruning of LLMs", "abstract": "We propose a novel Two-Stage framework for Structured Pruning (2SSP) for pruning Large Language Models (LLMs), which combines two different strategies of pruning, namely Width and Depth Pruning. The first stage (Width Pruning) removes entire neurons, hence their corresponding rows and columns, aiming to preserve the connectivity among the pruned structures in the intermediate state of the Feed-Forward Networks in each Transformer block. This is done based on an importance score measuring the impact of each neuron over the output magnitude. The second stage (Depth Pruning), instead, removes entire Attention submodules. This is done by applying an iterative process that removes the Attention submodules with the minimum impact on a given metric of interest (in our case, perplexity). We also propose a novel mechanism to balance the sparsity rate of the two stages w.r.t. to the desired global sparsity. We test 2SSP on four LLM families and three sparsity rates (25\\%, 37.5\\%, and 50\\%), measuring the resulting perplexity over three language modeling datasets as well as the performance over six downstream tasks. Our method consistently outperforms five state-of-the-art competitors over three language modeling and six downstream tasks, with an up to two-order-of-magnitude gain in terms of pruning time. The code is available at available at \\url{https://github.com/FabrizioSandri/2SSP}.", "authors": [ "Giovanni Iacca", "Elia Cunegatti", "Fabrizio Sandri" ], "published": "2025-01-29", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3dbench-a-scalable-3d-benchmark-and", "arxiv_id": "2404.14678", "nips_id": null, "url_abs": "https://arxiv.org/abs/2404.14678v1", "url_pdf": "https://arxiv.org/pdf/2404.14678v1.pdf", "title": "3DBench: A Scalable 3D Benchmark and Instruction-Tuning Dataset", "abstract": "Evaluating the performance of Multi-modal Large Language Models (MLLMs), integrating both point cloud and language, presents significant challenges. The lack of a comprehensive assessment hampers determining whether these models truly represent advancements, thereby impeding further progress in the field. Current evaluations heavily rely on classification and caption tasks, falling short in providing a thorough assessment of MLLMs. A pressing need exists for a more sophisticated evaluation method capable of thoroughly analyzing the spatial understanding and expressive capabilities of these models. 
To address these issues, we introduce a scalable 3D benchmark, accompanied by a large-scale instruction-tuning dataset known as 3DBench, providing an extensible platform for a comprehensive evaluation of MLLMs. Specifically, we establish the benchmark that spans a wide range of spatial and semantic scales, from object-level to scene-level, addressing both perception and planning tasks. Furthermore, we present a rigorous pipeline for automatically constructing scalable 3D instruction-tuning datasets, covering 10 diverse multi-modal tasks with more than 0.23 million QA pairs generated in total. Thorough experiments evaluating trending MLLMs, comparisons against existing datasets, and variations of training protocols demonstrate the superiority of 3DBench, offering valuable insights into current limitations and potential research directions.", "authors": [ "Dan Zeng", "Yongshun Gong", "Xiaoshui Huang", "Tianci Hu", "Junjie Zhang" ], "published": "2024-04-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-building-generation-in-minecraft-via-large", "arxiv_id": "2406.08751", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.08751v1", "url_pdf": "https://arxiv.org/pdf/2406.08751v1.pdf", "title": "3D Building Generation in Minecraft via Large Language Models", "abstract": "Recently, procedural content generation has exhibited considerable advancements in the domain of 2D game level generation such as Super Mario Bros. and Sokoban through large language models (LLMs). To further validate the capabilities of LLMs, this paper explores how LLMs contribute to the generation of 3D buildings in a sandbox game, Minecraft. We propose a Text to Building in Minecraft (T2BM) model, which involves refining prompts, decoding interlayer representation and repairing. Facade, indoor scene and functional blocks like doors are supported in the generation. Experiments are conducted to evaluate the completeness and satisfaction of buildings generated via LLMs. It shows that LLMs hold significant potential for 3D building generation. Given appropriate prompts, LLMs can generate correct buildings in Minecraft with complete structures and incorporate specific building blocks such as windows and beds, meeting the specified requirements of human users.", "authors": [ "Jialin Liu", "Chengpeng Hu", "Zengrong Huang", "Shiying Hu" ], "published": "2024-06-13", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-gpt-procedural-3d-modeling-with-large", "arxiv_id": "2310.12945", "nips_id": null, "url_abs": "https://arxiv.org/abs/2310.12945v2", "url_pdf": "https://arxiv.org/pdf/2310.12945v2.pdf", "title": "3D-GPT: Procedural 3D Modeling with Large Language Models", "abstract": "In the pursuit of efficient automated content creation, procedural generation, leveraging modifiable parameters and rule-based systems, emerges as a promising approach. Nonetheless, it could be a demanding endeavor, given its intricate nature necessitating a deep understanding of rules, algorithms, and parameters. To reduce workload, we introduce 3D-GPT, a framework utilizing large language models~(LLMs) for instruction-driven 3D modeling. 3D-GPT positions LLMs as proficient problem solvers, dissecting the procedural 3D modeling tasks into accessible segments and appointing the apt agent for each task. 3D-GPT integrates three core agents: the task dispatch agent, the conceptualization agent, and the modeling agent. 
They collaboratively achieve two objectives. First, it enhances concise initial scene descriptions, evolving them into detailed forms while dynamically adapting the text based on subsequent instructions. Second, it integrates procedural generation, extracting parameter values from enriched text to effortlessly interface with 3D software for asset creation. Our empirical investigations confirm that 3D-GPT not only interprets and executes instructions, delivering reliable results but also collaborates effectively with human designers. Furthermore, it seamlessly integrates with Blender, unlocking expanded manipulation possibilities. Our work highlights the potential of LLMs in 3D modeling, offering a basic framework for future advancements in scene generation and animation.", "authors": [ "Stephen Gould", "Zishan Qin", "Xinlong Wang", "Weijian Deng", "Junlin Han", "Chunyi Sun" ], "published": "2023-10-19", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-grand-towards-better-grounding-and-less", "arxiv_id": "2406.05132", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.05132v2", "url_pdf": "https://arxiv.org/pdf/2406.05132v2.pdf", "title": "3D-GRAND: A Million-Scale Dataset for 3D-LLMs with Better Grounding and Less Hallucination", "abstract": "The integration of language and 3D perception is crucial for developing embodied agents and robots that comprehend and interact with the physical world. While large language models (LLMs) have demonstrated impressive language understanding and generation capabilities, their adaptation to 3D environments (3D-LLMs) remains in its early stages. A primary challenge is the absence of large-scale datasets that provide dense grounding between language and 3D scenes. In this paper, we introduce 3D-GRAND, a pioneering large-scale dataset comprising 40,087 household scenes paired with 6.2 million densely-grounded scene-language instructions. Our results show that instruction tuning with 3D-GRAND significantly enhances grounding capabilities and reduces hallucinations in 3D-LLMs. As part of our contributions, we propose a comprehensive benchmark 3D-POPE to systematically evaluate hallucination in 3D-LLMs, enabling fair comparisons among future models. Our experiments highlight a scaling effect between dataset size and 3D-LLM performance, emphasizing the critical role of large-scale 3D-text datasets in advancing embodied AI research. Notably, our results demonstrate early signals for effective sim-to-real transfer, indicating that models trained on large synthetic data can perform well on real-world 3D scans. Through 3D-GRAND and 3D-POPE, we aim to equip the embodied AI community with essential resources and insights, setting the stage for more reliable and better-grounded 3D-LLMs. Project website: https://3d-grand.github.io", "authors": [ "Joyce Chai", "David F. 
Fouhey", "Shengyi Qian", "Madhavan Iyengar", "Nikhil Madaan", "Xuweiyi Chen", "Jianing Yang" ], "published": "2024-06-07", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3dgraphllm-combining-semantic-graphs-and", "arxiv_id": "2412.18450", "nips_id": null, "url_abs": "https://arxiv.org/abs/2412.18450v2", "url_pdf": "https://arxiv.org/pdf/2412.18450v2.pdf", "title": "3DGraphLLM: Combining Semantic Graphs and Large Language Models for 3D Scene Understanding", "abstract": "A 3D scene graph represents a compact scene model, storing information about the objects and the semantic relationships between them, making its use promising for robotic tasks. When interacting with a user, an embodied intelligent agent should be capable of responding to various queries about the scene formulated in natural language. Large Language Models (LLMs) are beneficial solutions for user-robot interaction due to their natural language understanding and reasoning abilities. Recent methods for creating learnable representations of 3D scenes have demonstrated the potential to improve the quality of LLMs responses by adapting to the 3D world. However, the existing methods do not explicitly utilize information about the semantic relationships between objects, limiting themselves to information about their coordinates. In this work, we propose a method 3DGraphLLM for constructing a learnable representation of a 3D scene graph. The learnable representation is used as input for LLMs to perform 3D vision-language tasks. In our experiments on popular ScanRefer, RIORefer, Multi3DRefer, ScanQA, Sqa3D, and Scan2cap datasets, we demonstrate the advantage of this approach over baseline methods that do not use information about the semantic relationships between objects. The code is publicly available at https://github.com/CognitiveAISystems/3DGraphLLM.", "authors": [ "Dmitry Yudin", "Tatiana Zemskova" ], "published": "2024-12-24", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-llm-injecting-the-3d-world-into-large", "arxiv_id": "2307.12981", "nips_id": null, "url_abs": "https://arxiv.org/abs/2307.12981v1", "url_pdf": "https://arxiv.org/pdf/2307.12981v1.pdf", "title": "3D-LLM: Injecting the 3D World into Large Language Models", "abstract": "Large language models (LLMs) and Vision-Language Models (VLMs) have been proven to excel at multiple tasks, such as commonsense reasoning. Powerful as these models can be, they are not grounded in the 3D physical world, which involves richer concepts such as spatial relationships, affordances, physics, layout, and so on. In this work, we propose to inject the 3D world into large language models and introduce a whole new family of 3D-LLMs. Specifically, 3D-LLMs can take 3D point clouds and their features as input and perform a diverse set of 3D-related tasks, including captioning, dense captioning, 3D question answering, task decomposition, 3D grounding, 3D-assisted dialog, navigation, and so on. Using three types of prompting mechanisms that we design, we are able to collect over 300k 3D-language data covering these tasks. To efficiently train 3D-LLMs, we first utilize a 3D feature extractor that obtains 3D features from rendered multi- view images. Then, we use 2D VLMs as our backbones to train our 3D-LLMs. By introducing a 3D localization mechanism, 3D-LLMs can better capture 3D spatial information. 
Experiments on ScanQA show that our model outperforms state-of-the-art baselines by a large margin (e.g., the BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore, experiments on our held-in datasets for 3D captioning, task composition, and 3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative examples also show that our model could perform more tasks beyond the scope of existing LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.", "authors": [ "Chuang Gan", "Zhenfang Chen", "Yilun Du", "Shuhong Zheng", "Peihao Chen", "Haoyu Zhen", "Yining Hong" ], "published": "2023-07-24", "conference": null, "conference_url_abs": "https://openreview.net/forum?id=YQA28p7qNz", "conference_url_pdf": "https://openreview.net/pdf?id=YQA28p7qNz", "proceeding": "neurips-2023-11" }, { "id": "3dmit-3d-multi-modal-instruction-tuning-for", "arxiv_id": "2401.03201", "nips_id": null, "url_abs": "https://arxiv.org/abs/2401.03201v2", "url_pdf": "https://arxiv.org/pdf/2401.03201v2.pdf", "title": "3DMIT: 3D Multi-modal Instruction Tuning for Scene Understanding", "abstract": "The remarkable potential of multi-modal large language models (MLLMs) in comprehending both vision and language information has been widely acknowledged. However, the scarcity of 3D scenes-language pairs in comparison to their 2D counterparts, coupled with the inadequacy of existing approaches in understanding of 3D scenes by LLMs, poses a significant challenge. In response, we collect and construct an extensive dataset comprising 75K instruction-response pairs tailored for 3D scenes. This dataset addresses tasks related to 3D VQA, 3D grounding, and 3D conversation. To further enhance the integration of 3D spatial information into LLMs, we introduce a novel and efficient prompt tuning paradigm, 3DMIT. This paradigm eliminates the alignment stage between 3D scenes and language and extends the instruction prompt with the 3D modality information including the entire scene and segmented objects. We evaluate the effectiveness of our method across diverse tasks in the 3D scene domain and find that our approach serves as a strategic means to enrich LLMs' comprehension of the 3D world. Our code is available at https://github.com/staymylove/3DMIT.", "authors": [ "Xiangde Liu", "Ruifei Ma", "Yifan Xu", "Ruilong Ren", "Xiaoyan Wang", "Chao Zhang", "Zeju Li" ], "published": "2024-01-06", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-moe-a-mixture-of-experts-multi-modal-llm", "arxiv_id": "2501.16698", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.16698v1", "url_pdf": "https://arxiv.org/pdf/2501.16698v1.pdf", "title": "3D-MoE: A Mixture-of-Experts Multi-modal LLM for 3D Vision and Pose Diffusion via Rectified Flow", "abstract": "3D vision and spatial reasoning have long been recognized as preferable for accurately perceiving our three-dimensional world, especially when compared with traditional visual reasoning based on 2D images. Due to the difficulties in collecting high-quality 3D data, research in this area has only recently gained momentum. With the advent of powerful large language models (LLMs), multi-modal LLMs for 3D vision have been developed over the past few years. However, most of these models focus primarily on the vision encoder for 3D data. In this paper, we propose converting existing densely activated LLMs into mixture-of-experts (MoE) models, which have proven effective for multi-modal data processing. 
In addition to leveraging these models' instruction-following capabilities, we further enable embodied task planning by attaching a diffusion head, Pose-DiT, that employs a novel rectified flow diffusion scheduler. Experimental results on 3D question answering and task-planning tasks demonstrate that our 3D-MoE framework achieves improved performance with fewer activated parameters.", "authors": [ "Irwin King", "Jianye Hao", "Yuzheng Zhuang", "Yueen Ma" ], "published": "2025-01-28", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-premise-can-large-language-models-generate", "arxiv_id": "2401.06437", "nips_id": null, "url_abs": "https://arxiv.org/abs/2401.06437v1", "url_pdf": "https://arxiv.org/pdf/2401.06437v1.pdf", "title": "3D-PreMise: Can Large Language Models Generate 3D Shapes with Sharp Features and Parametric Control?", "abstract": "Recent advancements in implicit 3D representations and generative models have markedly propelled the field of 3D object generation forward. However, it remains a significant challenge to accurately model geometries with defined sharp features under parametric controls, which is crucial in fields like industrial design and manufacturing. To bridge this gap, we introduce a framework that employs Large Language Models (LLMs) to generate text-driven 3D shapes, manipulating 3D software via program synthesis. We present 3D-PreMise, a dataset specifically tailored for 3D parametric modeling of industrial shapes, designed to explore state-of-the-art LLMs within our proposed pipeline. Our work reveals effective generation strategies and delves into the self-correction capabilities of LLMs using a visual interface. Our work highlights both the potential and limitations of LLMs in 3D parametric modeling for industrial applications.", "authors": [ "Junbo Zhao", "Qiang Zou", "Haoxuan Lan", "Zeqing Yuan" ], "published": "2024-01-12", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-properties-identifying-challenges-in-dpo", "arxiv_id": "2406.07327", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.07327v1", "url_pdf": "https://arxiv.org/pdf/2406.07327v1.pdf", "title": "3D-Properties: Identifying Challenges in DPO and Charting a Path Forward", "abstract": "Aligning large language models (LLMs) with human preference has recently gained tremendous attention, with the canonical yet costly RLHF-PPO and the simple and straightforward Direct Preference Optimization (DPO) as two examples. Despite the efficiency, DPO has rarely be used in the state-of-the-art production-level LLMs, implying its potential pathologies. In this work, we revisit DPO with a comprehensive examination of its empirical efficacy and a systematic comparison with RLHF-PPO. We identify the \\textbf{3D}-properties of DPO's learning outcomes: the \\textbf{D}rastic drop in the likelihood of rejected responses, the \\textbf{D}egradation into LLM unlearning, and the \\textbf{D}ispersion effect on unseen responses through experiments with both a carefully designed toy model and practical LLMs on tasks including mathematical problem-solving and instruction following. These findings inherently connect to some observations made by related works and we additionally contribute a plausible theoretical explanation for them. 
Accordingly, we propose easy regularization methods to mitigate the issues caused by \\textbf{3D}-properties, improving the training stability and final performance of DPO. Our contributions also include an investigation into how the distribution of the paired preference data impacts the effectiveness of DPO. We hope this work could offer research directions to narrow the gap between reward-free preference learning methods and reward-based ones.", "authors": [ "Dong Yan", "Zhijie Deng", "Jian Xie", "Yipin Zhang", "Jialian Li", "Yibo Miao", "Yuzi Yan" ], "published": "2024-06-11", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-question-answering-for-city-scene", "arxiv_id": "2407.17398", "nips_id": null, "url_abs": "https://arxiv.org/abs/2407.17398v1", "url_pdf": "https://arxiv.org/pdf/2407.17398v1.pdf", "title": "3D Question Answering for City Scene Understanding", "abstract": "3D multimodal question answering (MQA) plays a crucial role in scene understanding by enabling intelligent agents to comprehend their surroundings in 3D environments. While existing research has primarily focused on indoor household tasks and outdoor roadside autonomous driving tasks, there has been limited exploration of city-level scene understanding tasks. Furthermore, existing research faces challenges in understanding city scenes, due to the absence of spatial semantic information and human-environment interaction information at the city level.To address these challenges, we investigate 3D MQA from both dataset and method perspectives. From the dataset perspective, we introduce a novel 3D MQA dataset named City-3DQA for city-level scene understanding, which is the first dataset to incorporate scene semantic and human-environment interactive tasks within the city. From the method perspective, we propose a Scene graph enhanced City-level Understanding method (Sg-CityU), which utilizes the scene graph to introduce the spatial semantic. A new benchmark is reported and our proposed Sg-CityU achieves accuracy of 63.94 % and 63.76 % in different settings of City-3DQA. Compared to indoor 3D MQA methods and zero-shot using advanced large language models (LLMs), Sg-CityU demonstrates state-of-the-art (SOTA) performance in robustness and generalization.", "authors": [ "Xiaowen Chu", "Yang Yang", "Tiefeng Li", "Qiang Wang", "Xiaofei Yang", "Xiang Liu", "Yaoxian Song", "Penglei Sun" ], "published": "2024-07-24", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-representation-in-512-byte-variational", "arxiv_id": "2412.02202", "nips_id": null, "url_abs": "https://arxiv.org/abs/2412.02202v1", "url_pdf": "https://arxiv.org/pdf/2412.02202v1.pdf", "title": "3D representation in 512-Byte:Variational tokenizer is the key for autoregressive 3D generation", "abstract": "Autoregressive transformers have revolutionized high-fidelity image generation. One crucial ingredient lies in the tokenizer, which compresses high-resolution image patches into manageable discrete tokens with a scanning or hierarchical order suitable for large language models. Extending these tokenizers to 3D generation, however, presents a significant challenge: unlike image patches that naturally exhibit spatial sequence and multi-scale relationships, 3D data lacks an inherent order, making it difficult to compress into fewer tokens while preserving structural details. 
To address this, we introduce the Variational Tokenizer (VAT), which transforms unordered 3D data into compact latent tokens with an implicit hierarchy, suited for efficient and high-fidelity coarse-to-fine autoregressive modeling. VAT begins with an in-context transformer, which compress numerous unordered 3D features into a reduced token set with minimal information loss. This latent space is then mapped to a Gaussian distribution for residual quantization, with token counts progressively increasing across scales. In this way, tokens at different scales naturally establish the interconnections by allocating themselves into different subspaces within the same Gaussian distribution, facilitating discrete modeling of token relationships across scales. During the decoding phase, a high-resolution triplane is utilized to convert these compact latent tokens into detailed 3D shapes. Extensive experiments demonstrate that VAT enables scalable and efficient 3D generation, outperforming existing methods in quality, efficiency, and generalization. Remarkably, VAT achieves up to a 250x compression, reducing a 1MB mesh to just 3.9KB with a 96% F-score, and can further compress to 256 int8 tokens, achieving a 2000x reduction while maintaining a 92% F-score.", "authors": [ "Mu Xu", "Feng Xiong", "Jinzhi Zhang" ], "published": "2024-12-03", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3ds-decomposed-difficulty-data-selection-s", "arxiv_id": "2410.10901", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.10901v1", "url_pdf": "https://arxiv.org/pdf/2410.10901v1.pdf", "title": "3DS: Decomposed Difficulty Data Selection's Case Study on LLM Medical Domain Adaptation", "abstract": "Large Language Models(LLMs) excel in general tasks but struggle in specialized domains like healthcare due to limited domain-specific knowledge.Supervised Fine-Tuning(SFT) data construction for domain adaptation often relies on heuristic methods, such as GPT-4 annotation or manual data selection, with a data-centric focus on presumed diverse, high-quality datasets. However, these methods overlook the model's inherent knowledge distribution, introducing noise, redundancy, and irrelevant data, leading to a mismatch between the selected data and the model's learning task, resulting in suboptimal performance. To address this, we propose a two-stage model-centric data selection framework, Decomposed Difficulty Data Selection (3DS), which aligns data with the model's knowledge distribution for optimized adaptation. In Stage1, we apply Prompt-Driven Data Selection via Explicit Alignment, where the the model filters irrelevant or redundant data based on its internal knowledge. In Stage2, we perform Decomposed Difficulty Data Selection, where data selection is guided by our defined difficulty decomposition, using three metrics: Instruction Understanding, Response Confidence, and Response Correctness. Additionally, an attention-based importance weighting mechanism captures token importance for more accurate difficulty calibration. This two-stage approach ensures the selected data is not only aligned with the model's knowledge and preferences but also appropriately challenging for the model to learn, leading to more effective and targeted domain adaptation. In the case study of the medical domain, our extensive experiments on real-world healthcare datasets demonstrate the superiority of 3DS over exisiting methods in accuracy by over 5.29%. 
Our dataset and code will be open-sourced at https://anonymous.4open.science/r/3DS-E67F.", "authors": [ "Yasha Wang", "Junfeng Zhao", "Xu Chu", "Yongxin Xu", "Jinyang Zhang", "Xinke Jiang", "Runchuan Zhu", "Yue Fang", "Hongxin Ding" ], "published": "2024-10-13", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3d-spatial-understanding-in-mllms", "arxiv_id": "2412.06613", "nips_id": null, "url_abs": "https://arxiv.org/abs/2412.06613v1", "url_pdf": "https://arxiv.org/pdf/2412.06613v1.pdf", "title": "3D Spatial Understanding in MLLMs: Disambiguation and Evaluation", "abstract": "Multimodal Large Language Models (MLLMs) have made significant progress in tasks such as image captioning and question answering. However, while these models can generate realistic captions, they often struggle with providing precise instructions, particularly when it comes to localizing and disambiguating objects in complex 3D environments. This capability is critical as MLLMs become more integrated with collaborative robotic systems. In scenarios where a target object is surrounded by similar objects (distractors), robots must deliver clear, spatially-aware instructions to guide humans effectively. We refer to this challenge as contextual object localization and disambiguation, which imposes stricter constraints than conventional 3D dense captioning, especially regarding ensuring target exclusivity. In response, we propose simple yet effective techniques to enhance the model's ability to localize and disambiguate target objects. Our approach not only achieves state-of-the-art performance on conventional metrics that evaluate sentence similarity, but also demonstrates improved 3D spatial understanding through 3D visual grounding model.", "authors": [ "Didier Stricker", "Alain Pagani", "Chun-Peng Chang" ], "published": "2024-12-09", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3dtopia-large-text-to-3d-generation-model", "arxiv_id": "2403.02234", "nips_id": null, "url_abs": "https://arxiv.org/abs/2403.02234v2", "url_pdf": "https://arxiv.org/pdf/2403.02234v2.pdf", "title": "3DTopia: Large Text-to-3D Generation Model with Hybrid Diffusion Priors", "abstract": "We present a two-stage text-to-3D generation system, namely 3DTopia, which generates high-quality general 3D assets within 5 minutes using hybrid diffusion priors. The first stage samples from a 3D diffusion prior directly learned from 3D data. Specifically, it is powered by a text-conditioned tri-plane latent diffusion model, which quickly generates coarse 3D samples for fast prototyping. The second stage utilizes 2D diffusion priors to further refine the texture of coarse 3D models from the first stage. The refinement consists of both latent and pixel space optimization for high-quality texture generation. To facilitate the training of the proposed system, we clean and caption the largest open-source 3D dataset, Objaverse, by combining the power of vision language models and large language models. Experiment results are reported qualitatively and quantitatively to show the performance of the proposed system. 
Our codes and models are available at https://github.com/3DTopia/3DTopia", "authors": [ "Shuai Yang", "Ziwei Liu", "Dahua Lin", "Liang Pan", "Tengfei Wang", "Zhaoxi Chen", "Tong Wu", "Min Shi", "Ziang Cao", "Jiaxiang Tang", "Fangzhou Hong" ], "published": "2024-03-04", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3-in-1-2d-rotary-adaptation-for-efficient", "arxiv_id": "2409.00119", "nips_id": null, "url_abs": "https://arxiv.org/abs/2409.00119v2", "url_pdf": "https://arxiv.org/pdf/2409.00119v2.pdf", "title": "3-in-1: 2D Rotary Adaptation for Efficient Finetuning, Efficient Batching and Composability", "abstract": "Parameter-efficient finetuning (PEFT) methods effectively adapt large language models (LLMs) to diverse downstream tasks, reducing storage and GPU memory demands. Despite these advantages, several applications pose new challenges to PEFT beyond mere parameter efficiency. One notable challenge involves the efficient deployment of LLMs equipped with multiple task- or user-specific adapters, particularly when different adapters are needed for distinct requests within the same batch. Another challenge is the interpretability of LLMs, which is crucial for understanding how LLMs function. Previous studies introduced various approaches to address different challenges. In this paper, we introduce a novel method, RoAd, which employs a straightforward 2D rotation to adapt LLMs and addresses all the above challenges: (1) RoAd is remarkably parameter-efficient, delivering optimal performance on GLUE, eight commonsense reasoning tasks and four arithmetic reasoning tasks with $<0.1\\%$ trainable parameters; (2) RoAd facilitates the efficient serving of requests requiring different adapters within a batch, with an overhead comparable to element-wise multiplication instead of batch matrix multiplication; (3) RoAd enhances LLM's interpretability through integration within a framework of distributed interchange intervention, demonstrated via composition experiments.", "authors": [ "Christof Monz", "Baohao Liao" ], "published": "2024-08-28", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3p-llm-probabilistic-path-planning-using", "arxiv_id": "2403.18778", "nips_id": null, "url_abs": "https://arxiv.org/abs/2403.18778v1", "url_pdf": "https://arxiv.org/pdf/2403.18778v1.pdf", "title": "3P-LLM: Probabilistic Path Planning using Large Language Model for Autonomous Robot Navigation", "abstract": "Much worldly semantic knowledge can be encoded in large language models (LLMs). Such information could be of great use to robots that want to carry out high-level, temporally extended commands stated in natural language. However, the lack of real-world experience that language models have is a key limitation that makes it challenging to use them for decision-making inside a particular embodiment. This research assesses the feasibility of using LLM (GPT-3.5-turbo chatbot by OpenAI) for robotic path planning. The shortcomings of conventional approaches to managing complex environments and developing trustworthy plans for shifting environmental conditions serve as the driving force behind the research. Due to the sophisticated natural language processing abilities of LLM, the capacity to provide effective and adaptive path-planning algorithms in real-time, great accuracy, and few-shot learning capabilities, GPT-3.5-turbo is well suited for path planning in robotics. 
In numerous simulated scenarios, the research compares the performance of GPT-3.5-turbo with that of state-of-the-art path planners like Rapidly Exploring Random Tree (RRT) and A*. We observed that GPT-3.5-turbo is able to provide real-time path planning feedback to the robot and outperforms its counterparts. This paper establishes the foundation for LLM-powered path planning for robotic systems.", "authors": [], "published": "2024-03-27", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "3ur-llm-an-end-to-end-multimodal-large", "arxiv_id": "2501.07819", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.07819v1", "url_pdf": "https://arxiv.org/pdf/2501.07819v1.pdf", "title": "3UR-LLM: An End-to-End Multimodal Large Language Model for 3D Scene Understanding", "abstract": "Multi-modal Large Language Models (MLLMs) exhibit impressive capabilities in 2D tasks, yet encounter challenges in discerning the spatial positions, interrelations, and causal logic in scenes when transitioning from 2D to 3D representations. We find that the limitations mainly lie in: i) the high annotation cost restricting the scale-up of volumes of 3D scene data, and ii) the lack of a straightforward and effective way to perceive 3D information which results in prolonged training durations and complicates the streamlined framework. To this end, we develop pipeline based on open-source 2D MLLMs and LLMs to generate high-quality 3D-text pairs and construct 3DS-160K , to enhance the pre-training process. Leveraging this high-quality pre-training data, we introduce the 3UR-LLM model, an end-to-end 3D MLLM designed for precise interpretation of 3D scenes, showcasing exceptional capability in navigating the complexities of the physical world. 3UR-LLM directly receives 3D point cloud as input and project 3D features fused with text instructions into a manageable set of tokens. Considering the computation burden derived from these hybrid tokens, we design a 3D compressor module to cohesively compress the 3D spatial cues and textual narrative. 3UR-LLM achieves promising performance with respect to the previous SOTAs, for instance, 3UR-LLM exceeds its counterparts by 7.1\\% CIDEr on ScanQA, while utilizing fewer training resources. The code and model weights for 3UR-LLM and the 3DS-160K benchmark are available at 3UR-LLM.", "authors": [ "Huchuan Lu", "Lu Zhang", "Jiawen Zhu", "Yunzhi Zhuge", "Haomiao Xiong" ], "published": "2025-01-14", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "4bit-quantization-in-vector-embedding-for-rag", "arxiv_id": "2501.10534", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.10534v1", "url_pdf": "https://arxiv.org/pdf/2501.10534v1.pdf", "title": "4bit-Quantization in Vector-Embedding for RAG", "abstract": "Retrieval-augmented generation (RAG) is a promising technique that has shown great potential in addressing some of the limitations of large language models (LLMs). LLMs have two major limitations: they can contain outdated information due to their training data, and they can generate factually inaccurate responses, a phenomenon known as hallucinations. RAG aims to mitigate these issues by leveraging a database of relevant documents, which are stored as embedding vectors in a high-dimensional space. However, one of the challenges of using high-dimensional embeddings is that they require a significant amount of memory to store. 
This can be a major issue, especially when dealing with large databases of documents. To alleviate this problem, we propose the use of 4-bit quantization to store the embedding vectors. This involves reducing the precision of the vectors from 32-bit floating-point numbers to 4-bit integers, which can significantly reduce the memory requirements. Our approach has several benefits. Firstly, it significantly reduces the memory storage requirements of the high-dimensional vector database, making it more feasible to deploy RAG systems in resource-constrained environments. Secondly, it speeds up the searching process, as the reduced precision of the vectors allows for faster computation. Our code is available at https://github.com/taeheej/4bit-Quantization-in-Vector-Embedding-for-RAG", "authors": [ "Taehee Jeong" ], "published": "2025-01-17", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "4m-massively-multimodal-masked-modeling-1", "arxiv_id": "2312.06647", "nips_id": null, "url_abs": "https://arxiv.org/abs/2312.06647v1", "url_pdf": "https://arxiv.org/pdf/2312.06647v1.pdf", "title": "4M: Massively Multimodal Masked Modeling", "abstract": "Current machine learning models for vision are often highly specialized and limited to a single modality and task. In contrast, recent large language models exhibit a wide range of capabilities, hinting at a possibility for similarly versatile models in computer vision. In this paper, we take a step in this direction and propose a multimodal training scheme called 4M. It consists of training a single unified Transformer encoder-decoder using a masked modeling objective across a wide range of input/output modalities - including text, images, geometric, and semantic modalities, as well as neural network feature maps. 4M achieves scalability by unifying the representation space of all modalities through mapping them into discrete tokens and performing multimodal masked modeling on a small randomized subset of tokens. 4M leads to models that exhibit several key capabilities: (1) they can perform a diverse set of vision tasks out of the box, (2) they excel when fine-tuned for unseen downstream tasks or new input modalities, and (3) they can function as a generative model that can be conditioned on arbitrary modalities, enabling a wide variety of expressive multimodal editing capabilities with remarkable flexibility. Through experimental analyses, we demonstrate the potential of 4M for training versatile and scalable foundation models for vision tasks, setting the stage for further exploration in multimodal learning for vision and other domains.", "authors": [ "Amir Zamir", "Afshin Dehghan", "Mingfei Gao", "Teresa Yeo", "Oğuzhan Fatih Kar", "Roman Bachmann", "David Mizrahi" ], "published": "2023-12-11", "conference": "4m-massively-multimodal-masked-modeling", "conference_url_abs": "https://openreview.net/forum?id=TegmlsD8oQ", "conference_url_pdf": "https://openreview.net/pdf?id=TegmlsD8oQ", "proceeding": "neurips-2023-11" }, { "id": "2408-03094", "arxiv_id": "2408.03094", "nips_id": null, "url_abs": "https://arxiv.org/abs/2408.03094v1", "url_pdf": "https://arxiv.org/pdf/2408.03094v1.pdf", "title": "500xCompressor: Generalized Prompt Compression for Large Language Models", "abstract": "Prompt compression is crucial for enhancing inference speed, reducing costs, and improving user experience. However, current methods face challenges such as low compression ratios and potential data leakage during evaluation. 
To address these issues, we propose 500xCompressor, a method that compresses extensive natural language contexts into as few as a single special token. The 500xCompressor introduces approximately 0.3% additional parameters and achieves compression ratios ranging from 6x to 480x. It is designed to compress any text, answer various types of questions, and can be used by the original large language model (LLM) without requiring fine-tuning. Initially, 500xCompressor was pretrained on the Arxiv Corpus, followed by fine-tuning on the ArxivQA dataset, and subsequently evaluated on strictly unseen and classical question answering (QA) datasets. The results demonstrate that the LLM retained 62.26-72.89% of its capabilities compared to using non-compressed prompts. This study also shows that not all the compressed tokens are equally utilized and that KV values have significant advantages over embeddings in preserving information at high compression ratios. The highly compressive nature of natural language prompts, even for fine-grained complex information, suggests promising potential for future applications and further research into developing a new LLM language.", "authors": [ "Nigel Collier", "Yixuan Su", "Zongqian Li" ], "published": "2024-08-06", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "50-shades-of-deceptive-patterns-a-unified", "arxiv_id": "2501.13351", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.13351v3", "url_pdf": "https://arxiv.org/pdf/2501.13351v3.pdf", "title": "50 Shades of Deceptive Patterns: A Unified Taxonomy, Multimodal Detection, and Security Implications", "abstract": "Deceptive patterns (DPs) are user interface designs deliberately crafted to manipulate users into unintended decisions, often by exploiting cognitive biases for the benefit of companies or services. While numerous studies have explored ways to identify these deceptive patterns, many existing solutions require significant human intervention and struggle to keep pace with the evolving nature of deceptive designs. To address these challenges, we expanded the deceptive pattern taxonomy from security and privacy perspectives, refining its categories and scope. We created a comprehensive dataset of deceptive patterns by integrating existing small-scale datasets with new samples, resulting in 6,725 images and 10,421 DP instances from mobile apps and websites. We then developed DPGuard, a novel automatic tool leveraging commercial multimodal large language models (MLLMs) for deceptive pattern detection. Experimental results show that DPGuard outperforms state-of-the-art methods. Finally, we conducted an extensive empirical evaluation on 2,000 popular mobile apps and websites, revealing that 23.61% of mobile screenshots and 47.27% of website screenshots feature at least one deceptive pattern instance.
Through four unexplored case studies that inform security implications, we highlight the critical importance of the unified taxonomy in addressing the growing challenges of Internet deception.", "authors": [], "published": "2025-01-23", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "52b-to-1t-lessons-learned-via-tele-flm-series", "arxiv_id": "2407.02783", "nips_id": null, "url_abs": "https://arxiv.org/abs/2407.02783v1", "url_pdf": "https://arxiv.org/pdf/2407.02783v1.pdf", "title": "52B to 1T: Lessons Learned via Tele-FLM Series", "abstract": "Large Language Models (LLMs) represent a significant stride toward Artificial General Intelligence. As scaling laws underscore the potential of increasing model sizes, the academic community has intensified its investigations into LLMs with capacities exceeding 50 billion parameters. This technical report builds on our prior work with Tele-FLM (also known as FLM-2), a publicly available 52-billion-parameter model. We delve into two primary areas: we first discuss our observation of Supervised Fine-tuning (SFT) on Tele-FLM-52B, which supports the \"less is more\" approach for SFT data construction; second, we demonstrate our experiments and analyses on the best practices for progressively growing a model from 52 billion to 102 billion, and subsequently to 1 trillion parameters. We will open-source a 1T model checkpoint, namely Tele-FLM-1T, to advance further training and research.", "authors": [ "Tiejun Huang", "Xuelong Li", "Zhongyuan Wang", "Zhongjiang He", "Yequan Wang", "Aixin Sun", "Bo Zhao", "Zheng Zhang", "Yongxiang Li", "Shuangyong Song", "Yuyao Huang", "Xin Wang", "Yu Zhao", "Zihan Wang", "Xinzhang Liu", "Chao Wang", "Xuezhi Fang", "Xin Jiang", "Yiqun Yao", "Xiang Li" ], "published": "2024-07-03", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "5w1h-extraction-with-large-language-models", "arxiv_id": "2405.16150", "nips_id": null, "url_abs": "https://arxiv.org/abs/2405.16150v1", "url_pdf": "https://arxiv.org/pdf/2405.16150v1.pdf", "title": "5W1H Extraction With Large Language Models", "abstract": "The extraction of essential news elements through the 5W1H framework (\\textit{What}, \\textit{When}, \\textit{Where}, \\textit{Why}, \\textit{Who}, and \\textit{How}) is critical for event extraction and text summarization. The advent of Large language models (LLMs) such as ChatGPT presents an opportunity to address language-related tasks through simple prompts, without spending much time fine-tuning models. However, ChatGPT has encountered challenges in processing longer news texts and analyzing specific attributes in context, especially when answering questions about \\textit{What}, \\textit{Why}, and \\textit{How}. The effectiveness of extraction tasks is notably dependent on high-quality human-annotated datasets. However, the absence of such datasets for 5W1H extraction increases the difficulty of fine-tuning strategies based on open-source LLMs. To address these limitations, first, we annotate a high-quality 5W1H dataset based on four typical news corpora (\\textit{CNN/DailyMail}, \\textit{XSum}, \\textit{NYT}, \\textit{RA-MDS}); second, we design several strategies from zero-shot/few-shot prompting to efficient fine-tuning to conduct 5W1H aspect extraction from the original news documents.
The experimental results demonstrate that the fine-tuned models on our labelled dataset outperform ChatGPT. Furthermore, we explore the domain adaptation capability by testing the source-domain (e.g. NYT) models on the target domain corpus (e.g. CNN/DailyMail) for the task of 5W1H extraction.", "authors": [ "Piji Li", "Feiyan Zhai", "Yangsong Lan", "Yang Cao" ], "published": "2024-05-25", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "empirical-insights-on-fine-tuning-large", "arxiv_id": "2409.15825", "nips_id": null, "url_abs": "https://arxiv.org/abs/2409.15825v2", "url_pdf": "https://arxiv.org/pdf/2409.15825v2.pdf", "title": "60 Data Points are Sufficient to Fine-Tune LLMs for Question-Answering", "abstract": "Large language models (LLMs) encode extensive world knowledge through pre-training on massive datasets, which can then be fine-tuned for the question-answering (QA) task. However, effective strategies for fine-tuning LLMs for the QA task remain largely unexplored. To address this gap, we categorize supervised fine-tuning (SFT) data based on the extent of knowledge memorized by the pretrained LLMs and conduct a series of empirical analyses. Our experiments, involving four LLMs from three different model families, focus on three key factors: the amount of data required for SFT, the impact of different SFT datasets on model performance, and how data requirements vary across LLMs. The results show that as few as 60 data points during the SFT stage can activate the knowledge encoded during pre-training, enabling LLMs to perform the QA task. Additionally, SFT with data of varying memory levels has a significant impact on LLM performance, with the optimal dataset differing based on the specific model being fine-tuned. Future research will delve deeper into the mechanisms underlying these phenomena.", "authors": [ "Jianping Fan", "Zhongchao shi", "Peng Wang", "Xuanjing Huang", "Tao Gui", "Qi Zhang", "Yuming Yang", "Junjie Ye" ], "published": "2024-09-24", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "6g-comprehensive-intelligence-network", "arxiv_id": "2404.18373", "nips_id": null, "url_abs": "https://arxiv.org/abs/2404.18373v3", "url_pdf": "https://arxiv.org/pdf/2404.18373v3.pdf", "title": "6G comprehensive intelligence: network operations and optimization based on Large Language Models", "abstract": "The sixth generation mobile communication standard (6G) can promote the development of the Industrial Internet and the Internet of Things (IoT). To achieve comprehensive intelligent development of the network and provide customers with higher-quality personalized services, this paper proposes a network performance optimization and intelligent operation network architecture based on Large Language Models (LLMs), aiming to build a comprehensive intelligent 6G network system. The Large Language Model, with more parameters and stronger learning ability, can more accurately capture patterns and features in data, enabling more accurate content output and higher intelligence, and providing strong support for related research such as network data security, privacy protection, and health assessment.
This paper also presents the design framework of an LLM-based network health assessment system and focuses on its potential application value. Through the case of a network health management system, it demonstrates that an LLM-based 6G intelligent network system has important practical significance for the comprehensive realization of intelligence.", "authors": [], "published": "2024-04-29", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "70b-parameter-large-language-models-in", "arxiv_id": "2406.14882", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.14882v1", "url_pdf": "https://arxiv.org/pdf/2406.14882v1.pdf", "title": "70B-parameter large language models in Japanese medical question-answering", "abstract": "Since the rise of large language models (LLMs), domain adaptation has been one of the hot topics in various domains. Many medical LLMs trained on English medical datasets have been made public recently. However, research on Japanese LLMs in the medical domain is still lacking. Here we utilize multiple 70B-parameter LLMs for the first time and show that instruction tuning using a Japanese medical question-answering dataset significantly improves the ability of Japanese LLMs to solve Japanese medical license exams, surpassing 50\\% in accuracy. In particular, the Japanese-centric models exhibit a more significant leap in improvement through instruction tuning compared to their English-centric counterparts. This underscores the importance of continual pretraining and the adjustment of the tokenizer for our local language. We also examine two slightly different prompt formats, resulting in a non-negligible performance improvement.", "authors": [ "Satoshi Kodera", "Risa Kishikawa", "Issey Sukeda" ], "published": "2024-06-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-2-nav-action-aware-zero-shot-robot", "arxiv_id": "2308.07997", "nips_id": null, "url_abs": "https://arxiv.org/abs/2308.07997v1", "url_pdf": "https://arxiv.org/pdf/2308.07997v1.pdf", "title": "$A^2$Nav: Action-Aware Zero-Shot Robot Navigation by Exploiting Vision-and-Language Ability of Foundation Models", "abstract": "We study the task of zero-shot vision-and-language navigation (ZS-VLN), a practical yet challenging problem in which an agent learns to navigate following a path described by language instructions without requiring any path-instruction annotation data. Normally, the instructions have complex grammatical structures and often contain various action descriptions (e.g., \"proceed beyond\", \"depart from\"). How to correctly understand and execute these action demands is a critical problem, and the absence of annotated data makes it even more challenging. Note that a well-educated human being can easily understand path instructions without the need for any special training. In this paper, we propose an action-aware zero-shot VLN method ($A^2$Nav) by exploiting the vision-and-language ability of foundation models. Specifically, the proposed method consists of an instruction parser and an action-aware navigation policy. The instruction parser utilizes the advanced reasoning ability of large language models (e.g., GPT-3) to decompose complex navigation instructions into a sequence of action-specific object navigation sub-tasks. Each sub-task requires the agent to localize the object and navigate to a specific goal position according to the associated action demand.
To accomplish these sub-tasks, an action-aware navigation policy is learned from freely collected action-specific datasets that reveal distinct characteristics of each action demand. We use the learned navigation policy for executing sub-tasks sequentially to follow the navigation instruction. Extensive experiments show $A^2$Nav achieves promising ZS-VLN performance and even surpasses the supervised learning methods on the R2R-Habitat and RxR-Habitat datasets.", "authors": [ "Chuang Gan", "Mingkui Tan", "Gaowen Liu", "Thomas H. Li", "Runhao Zeng", "Hongyan Zhi", "Xinyu Sun", "Peihao Chen" ], "published": "2023-08-15", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a2sf-accumulative-attention-scoring-with", "arxiv_id": "2407.20485", "nips_id": null, "url_abs": "https://arxiv.org/abs/2407.20485v2", "url_pdf": "https://arxiv.org/pdf/2407.20485v2.pdf", "title": "A2SF: Accumulative Attention Scoring with Forgetting Factor for Token Pruning in Transformer Decoder", "abstract": "Recently, large language models (LLMs) based on transformers are facing memory bottleneck issues due to the KV cache, especially in long sequence handling. Previous studies proposed KV cache compression techniques that identify insignificant tokens based on Accumulative Attention Scores and remove their items from the KV cache, noting that only a few tokens play an important role in attention operations. However, we have observed that the existing Accumulative Attention Score is not suitable for the transformer decoder structure. In the decoder model, the number of times the Attention Score accumulates varies depending on the order of token appearance due to the effect of masking, causing an uneven comparison between tokens. To solve this, we propose the Accumulative Attention Score with Forgetting Factor (A2SF) technique, which introduces a Forgetting Factor in the Attention Score accumulation process. A2SF applies a penalty to the past Attention Score generated from old tokens by repeatedly multiplying the Attention Score by the Forgetting Factor over time. Therefore, older tokens receive a larger penalty, providing fairness among different ages of tokens. Through the fair comparison among tokens, we can more effectively select important tokens. We have verified the accuracy improvement through A2SF in the OPT and LLaMA models, and A2SF improves the accuracy of LLaMA 2 by up to 7.8% and 5.1% in 1-shot and 0-shot settings.", "authors": [ "Dongkun Shin", "Hyun-rae Jo" ], "published": "2024-07-30", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a3-android-agent-arena-for-mobile-gui-agents", "arxiv_id": "2501.01149", "nips_id": null, "url_abs": "https://arxiv.org/abs/2501.01149v1", "url_pdf": "https://arxiv.org/pdf/2501.01149v1.pdf", "title": "A3: Android Agent Arena for Mobile GUI Agents", "abstract": "AI agents have become increasingly prevalent in recent years, driven by significant advancements in the field of large language models (LLMs). Mobile GUI agents, a subset of AI agents, are designed to autonomously perform tasks on mobile devices. While numerous studies have introduced agents, datasets, and benchmarks to advance mobile GUI agent research, many existing datasets focus on static frame evaluations and fail to provide a comprehensive platform for assessing performance on real-world, in-the-wild tasks. To address this gap, we present Android Agent Arena (A3), a novel evaluation platform.
Unlike existing in-the-wild systems, A3 offers: (1) meaningful and practical tasks, such as real-time online information retrieval and operational instructions; (2) a larger, more flexible action space, enabling compatibility with agents trained on any dataset; and (3) an automated, business-level, LLM-based evaluation process. A3 includes 21 widely used general third-party apps and 201 tasks representative of common user scenarios, providing a robust foundation for evaluating mobile GUI agents in real-world situations and a new autonomous evaluation process that requires less human labor and coding expertise. The project is available at \\url{https://yuxiangchai.github.io/Android-Agent-Arena/}.", "authors": [ "Hongsheng Li", "Siyuan Huang", "Shuai Ren", "Guozhi Wang", "Liang Liu", "Jiayu Zhang", "Hanhao Li", "Yuxiang Chai" ], "published": "2025-01-02", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aaai-workshop-on-ai-planning-for-cyber", "arxiv_id": "2410.07245", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.07245v1", "url_pdf": "https://arxiv.org/pdf/2410.07245v1.pdf", "title": "AAAI Workshop on AI Planning for Cyber-Physical Systems -- CAIPI24", "abstract": "The workshop 'AI-based Planning for Cyber-Physical Systems', which took place on February 26, 2024, as part of the 38th Annual AAAI Conference on Artificial Intelligence in Vancouver, Canada, brought together researchers to discuss recent advances in AI planning methods for Cyber-Physical Systems (CPS). CPS pose a major challenge due to their complexity and data-intensive nature, which often exceeds the capabilities of traditional planning algorithms. The workshop highlighted new approaches such as neuro-symbolic architectures, large language models (LLMs), deep reinforcement learning and advances in symbolic planning. These techniques are promising when it comes to managing the complexity of CPS and have potential for real-world applications.", "authors": [ "Niklas Widulle", "René Heesch", "Jonas Ehrhardt", "Alexander Diedrich", "Gautam Biswas", "Oliver Niggemann" ], "published": "2024-10-08", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aaar-1-0-assessing-ai-s-potential-to-assist", "arxiv_id": "2410.22394", "nips_id": null, "url_abs": "https://arxiv.org/abs/2410.22394v1", "url_pdf": "https://arxiv.org/pdf/2410.22394v1.pdf", "title": "AAAR-1.0: Assessing AI's Potential to Assist Research", "abstract": "Numerous studies have assessed the proficiency of AI systems, particularly large language models (LLMs), in facilitating everyday tasks such as email writing, question answering, and creative content generation. However, researchers face unique challenges and opportunities in leveraging LLMs for their own work, such as brainstorming research ideas, designing experiments, and writing or reviewing papers. In this study, we introduce AAAR-1.0, a benchmark dataset designed to evaluate LLM performance in four fundamental, expertise-intensive research tasks: (i) EquationInference, assessing the correctness of equations based on the contextual information in paper submissions; (ii) ExperimentDesign, designing experiments to validate research ideas and solutions; (iii) PaperWeakness, identifying weaknesses in paper submissions; and (iv) REVIEWCRITIQUE, identifying whether each segment in human reviews is deficient or not.
AAAR-1.0 differs from prior benchmarks in two key ways: first, it is explicitly research-oriented, with tasks requiring deep domain expertise; second, it is researcher-oriented, mirroring the primary activities that researchers engage in on a daily basis. An evaluation of both open-source and proprietary LLMs reveals their potential as well as limitations in conducting sophisticated research tasks. We will keep iterating AAAR-1.0 to new versions.", "authors": [ "Wenpeng Yin", "Lifu Huang", "Congying Xia", "Kai Zhang", "Xi Li", "Wenchao Ma", "Zhuoyang Zou", "Hongchao Fang", "Jihyun Janice Ahn", "Yusen Zhang", "Yuxuan Sun", "Jian Xie", "Xiaoxin Lu", "Ryo Kamoi", "Jiangshu Du", "Sijia Wang", "Hanzi Xu", "Renze Lou" ], "published": "2024-10-29", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aad-llm-adaptive-anomaly-detection-using", "arxiv_id": "2411.00914", "nips_id": null, "url_abs": "https://arxiv.org/abs/2411.00914v1", "url_pdf": "https://arxiv.org/pdf/2411.00914v1.pdf", "title": "AAD-LLM: Adaptive Anomaly Detection Using Large Language Models", "abstract": "For data-constrained, complex and dynamic industrial environments, there is a critical need for transferable and multimodal methodologies to enhance anomaly detection and therefore, prevent costs associated with system failures. Typically, traditional PdM approaches are not transferable or multimodal. This work examines the use of Large Language Models (LLMs) for anomaly detection in complex and dynamic manufacturing systems. The research aims to improve the transferability of anomaly detection models by leveraging Large Language Models (LLMs) and seeks to validate the enhanced effectiveness of the proposed approach in data-sparse industrial applications. The research also seeks to enable more collaborative decision-making between the model and plant operators by allowing for the enriching of input series data with semantics. Additionally, the research aims to address the issue of concept drift in dynamic industrial settings by integrating an adaptability mechanism. The literature review examines the latest developments in LLM time series tasks alongside associated adaptive anomaly detection methods to establish a robust theoretical framework for the proposed architecture. This paper presents a novel model framework (AAD-LLM) that doesn't require any training or finetuning on the dataset it is applied to and is multimodal. Results suggest that anomaly detection can be converted into a \"language\" task to deliver effective, context-aware detection in data-constrained industrial applications. This work, therefore, contributes significantly to advancements in anomaly detection methodologies.", "authors": [ "Joshua Church", "Thomas Arnold", "Joseph Jaboure", "Maria Seale", "Shahram Rahimi", "Sudip Mittal", "Logan Cummins", "Andrew Thompson", "Alexander Sommers", "Alicia Russell-Gilbert" ], "published": "2024-11-01", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aakos-aspect-adaptive-knowledge-based-opinion", "arxiv_id": "2306.05537", "nips_id": null, "url_abs": "https://arxiv.org/abs/2306.05537v1", "url_pdf": "https://arxiv.org/pdf/2306.05537v1.pdf", "title": "AaKOS: Aspect-adaptive Knowledge-based Opinion Summarization", "abstract": "The rapid growth of information on the Internet has led to an overwhelming amount of opinions and comments on various activities, products, and services. 
This makes it difficult and time-consuming for users to process all the available information when making decisions. Text summarization, a Natural Language Processing (NLP) task, has been widely explored to help users quickly retrieve relevant information by generating short and salient content from long or multiple documents. Recent advances in pre-trained language models, such as ChatGPT, have demonstrated the potential of Large Language Models (LLMs) in text generation. However, LLMs require massive amounts of data and resources and are challenging to implement as offline applications. Furthermore, existing text summarization approaches often lack the ``adaptive\" nature required to capture diverse aspects in opinion summarization, which is particularly detrimental to users with specific requirements or preferences. In this paper, we propose an Aspect-adaptive Knowledge-based Opinion Summarization model for product reviews, which effectively captures the adaptive nature required for opinion summarization. The model generates aspect-oriented summaries given a set of reviews for a particular product, efficiently providing users with useful information on specific aspects they are interested in, ensuring the generated summaries are more personalized and informative. Extensive experiments have been conducted using real-world datasets to evaluate the proposed model. The results demonstrate that our model outperforms state-of-the-art approaches and is adaptive and efficient in generating summaries that focus on particular aspects, enabling users to make well-informed decisions and catering to their diverse interests and preferences.", "authors": [ "Quan Bai", "Edmund M-K. Lai", "Weihua Li", "Guan Wang" ], "published": "2023-05-26", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aalap-ai-assistant-for-legal-paralegal", "arxiv_id": "2402.01758", "nips_id": null, "url_abs": "https://arxiv.org/abs/2402.01758v1", "url_pdf": "https://arxiv.org/pdf/2402.01758v1.pdf", "title": "Aalap: AI Assistant for Legal & Paralegal Functions in India", "abstract": "Using proprietary Large Language Models on legal tasks poses challenges due to data privacy issues, domain data heterogeneity, domain knowledge sophistication, and domain objectives uniqueness. We created Aalap, a Mistral 7B model fine-tuned on instruction data related to specific Indian legal tasks. The performance of Aalap is better than that of gpt-3.5-turbo in 31\\% of our test data, and it obtains an equivalent score in 34\\% of the test data, as evaluated by GPT4. Training Aalap mainly focuses on teaching legal reasoning rather than legal recall. Aalap is definitely helpful for the day-to-day activities of lawyers, judges, or anyone working in legal systems.", "authors": [ "Smita Gupta", "Varun Hemachandran", "Saurabh Karn", "Atreyo Banerjee", "Prathamesh Kalamkar", "Aman Tiwari" ], "published": "2024-01-30", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "aart-ai-assisted-red-teaming-with-diverse", "arxiv_id": "2311.08592", "nips_id": null, "url_abs": "https://arxiv.org/abs/2311.08592v2", "url_pdf": "https://arxiv.org/pdf/2311.08592v2.pdf", "title": "AART: AI-Assisted Red-Teaming with Diverse Data Generation for New LLM-powered Applications", "abstract": "Adversarial testing of large language models (LLMs) is crucial for their safe and responsible deployment.
We introduce a novel approach for automated generation of adversarial evaluation datasets to test the safety of LLM generations on new downstream applications. We call it AI-assisted Red-Teaming (AART) - an automated alternative to current manual red-teaming efforts. AART offers a data generation and augmentation pipeline of reusable and customizable recipes that reduce human effort significantly and enable integration of adversarial testing earlier in new product development. AART generates evaluation datasets with high diversity of content characteristics critical for effective adversarial testing (e.g. sensitive and harmful concepts, specific to a wide range of cultural and geographic regions and application scenarios). The data generation is steered by AI-assisted recipes to define, scope and prioritize diversity within the application context. This feeds into a structured LLM-generation process that scales up evaluation priorities. Compared to some state-of-the-art tools, AART shows promising results in terms of concept coverage and data quality.", "authors": [ "Preethi Lahoti", "Lora Aroyo", "Kevin Robinson", "Bhaktipriya Radharapu" ], "published": "2023-11-14", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-b-a-general-generator-reader-framework-for", "arxiv_id": "2406.03963", "nips_id": null, "url_abs": "https://arxiv.org/abs/2406.03963v1", "url_pdf": "https://arxiv.org/pdf/2406.03963v1.pdf", "title": "A + B: A General Generator-Reader Framework for Optimizing LLMs to Unleash Synergy Potential", "abstract": "Retrieval-Augmented Generation (RAG) is an effective solution to supplement necessary knowledge to large language models (LLMs). Targeting its bottleneck of retriever performance, a \"generate-then-read\" pipeline has been proposed to replace the retrieval stage with generation from the LLM itself. Although promising, this research direction is underexplored and still cannot work in scenarios where source knowledge is given. In this paper, we formalize a general \"A + B\" framework with varying combinations of foundation models and types for systematic investigation. We explore the efficacy of the base and chat versions of LLMs and find their different functionalities suitable for generator A and reader B, respectively. Their combinations consistently outperform single models, especially in complex scenarios. Furthermore, we extend the application of the \"A + B\" framework to scenarios involving source documents through continuous learning, enabling the direct integration of external knowledge into LLMs. This approach not only facilitates effective acquisition of new knowledge but also addresses the challenges of safety and helpfulness post-adaptation.
The paper underscores the versatility of the \"A + B\" framework, demonstrating its potential to enhance the practical application of LLMs across various domains.", "authors": [ "Pengyuan Zhou", "Yong Liao", "Yuyue Zhao", "Bo wang", "Jiahao Ying", "Yixin Cao", "Wei Tang" ], "published": "2024-06-06", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-baseline-analysis-of-reward-models-ability", "arxiv_id": "2311.14743", "nips_id": null, "url_abs": "https://arxiv.org/abs/2311.14743v7", "url_pdf": "https://arxiv.org/pdf/2311.14743v7.pdf", "title": "A Baseline Analysis of Reward Models' Ability To Accurately Analyze Foundation Models Under Distribution Shift", "abstract": "Foundation models, specifically Large Language Models (LLMs), have lately gained widespread attention and adoption. Reinforcement Learning with Human Feedback (RLHF) involves training a reward model to capture desired behaviors, which is then used to align LLMs. These reward models are additionally used at inference time to estimate LLM responses' adherence to those desired behaviors. However, there is little work measuring how robust these reward models are to distribution shifts. In this work, we evaluate how reward model performance - measured via accuracy and calibration (i.e. alignment between accuracy and confidence) - is affected by distribution shift. We show novel calibration patterns and accuracy drops due to OOD prompts and responses, and that the reward model is more sensitive to shifts in responses than prompts. Additionally, we adapt an OOD detection technique commonly used in classification to the reward model setting to detect these distribution shifts in prompts and responses.", "authors": [ "Benjamin Pikus", "Anthony Chen", "Sean Hendryx", "Will LeVine" ], "published": "2023-11-21", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null }, { "id": "a-bayesian-approach-to-data-point-selection", "arxiv_id": "2411.03768", "nips_id": null, "url_abs": "https://arxiv.org/abs/2411.03768v1", "url_pdf": "https://arxiv.org/pdf/2411.03768v1.pdf", "title": "A Bayesian Approach to Data Point Selection", "abstract": "Data point selection (DPS) is becoming a critical topic in deep learning due to the ease of acquiring uncurated training data compared to the difficulty of obtaining curated or processed data. Existing approaches to DPS are predominantly based on a bi-level optimisation (BLO) formulation, which is demanding in terms of memory and computation, and exhibits some theoretical defects regarding minibatches. Thus, we propose a novel Bayesian approach to DPS. We view the DPS problem as posterior inference in a novel Bayesian model where the posterior distributions of the instance-wise weights and the main neural network parameters are inferred under a reasonable prior and likelihood model. We employ stochastic gradient Langevin MCMC sampling to learn the main network and instance-wise weights jointly, ensuring convergence even with minibatches. Our update equation is comparable to the widely used SGD and much more efficient than existing BLO-based methods. Through controlled experiments in both the vision and language domains, we present a proof of concept.
Additionally, we demonstrate that our method scales effectively to large language models and facilitates automated per-task optimization for instruction fine-tuning datasets.", "authors": [ "Timothy Hospedales", "Brais Martinez", "Royson Lee", "Minyoung Kim", "Xinnuo Xu" ], "published": "2024-11-06", "conference": null, "conference_url_abs": null, "conference_url_pdf": null, "proceeding": null } ] }{ "count": 24708, "next": "
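The listing above is one page of a paginated response: "count" gives the total number of matching papers, "results" holds the entries on the current page, and "next" points to the following page (null on the last page). Below is a minimal Python sketch of walking the pages; the endpoint URL, query parameters, and use of the requests package are assumptions based on the example request this listing documents, not a definitive client implementation.

import requests

# Illustrative pagination sketch: follow each page's "next" URL until it is
# null (or a page limit is hit), printing a few fields from every entry.
url = "https://paperswithcode.com/api/v1/papers/"  # assumed base endpoint
params = {"ordering": "title", "q": "Large Language Models"}  # assumed query
max_pages = 3  # keep the sketch polite; raise to walk the full listing

for _ in range(max_pages):
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    page = response.json()

    for paper in page["results"]:
        # Each entry carries the fields shown in the listing above.
        print(paper["published"], paper["arxiv_id"], paper["title"])

    url = page.get("next")   # absolute URL of the next page, or None
    params = None            # "next" already embeds the query string
    if url is None:
        break

print("total matching papers:", page["count"])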