Return a paginated listing of all papers. The optional q parameter filters the listing by a search query, and the count, next, and previous fields in the response describe the pagination state.
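
As a quick illustration, here is a minimal sketch of consuming this endpoint from Python, assuming the requests library; the helper name iter_papers and its max_pages argument are hypothetical and not part of the API. It walks the result pages by following the next links shown in the response example below.

import requests

BASE_URL = "https://paperswithcode.com/api/v1/papers/"

def iter_papers(query, max_pages=None):
    # Yield paper records for a search query, following the paginated "next" links.
    url, params, pages = BASE_URL, {"q": query}, 0
    while url:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
        for paper in payload["results"]:
            yield paper
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        # "next" is an absolute URL that already carries the q and page parameters.
        url, params = payload["next"], None

# Print the publication date and title of each paper on the first page of results.
for paper in iter_papers("Large Language Models", max_pages=1):
    print(paper["published"], paper["title"])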

GET /api/v1/papers/?q=Large%20Language%20Models
HTTP 200 OK
Allow: GET, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "count": 23921,
    "next": "https://paperswithcode.com/api/v1/papers/?page=2&q=Large+Language+Models",
    "previous": null,
    "results": [
        {
            "id": "n-gram-counts-and-language-models-from-the",
            "arxiv_id": null,
            "nips_id": null,
            "url_abs": "https://aclanthology.org/L14-1074",
            "url_pdf": "https://aclanthology.org/L14-1074.pdf",
            "title": "N-gram Counts and Language Models from the Common Crawl",
            "abstract": "We contribute 5-gram counts and language models trained on the Common Crawl corpus, a collection over 9 billion web pages. This release improves upon the Google n-gram counts in two key ways: the inclusion of low-count entries and deduplication to reduce boilerplate. By preserving singletons, we were able to use Kneser-Ney smoothing to build large language models. This paper describes how the corpus was processed with emphasis on the problems that arise in working with data at this scale. Our unpruned Kneser-Ney English {\\$}5{\\$}-gram language model, built on 975 billion deduplicated tokens, contains over 500 billion unique n-grams. We show gains of 0.5-1.4 BLEU by using large language models to translate into various languages.",
            "authors": [
                "Christian Buck",
                "Bas van Ooyen",
                "Kenneth Heafield"
            ],
            "published": "2014-05-01",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": "lrec-2014-5"
        },
        {
            "id": "using-monolingual-data-in-neural-machine-1",
            "arxiv_id": "1903.11437",
            "nips_id": null,
            "url_abs": "http://arxiv.org/abs/1903.11437v1",
            "url_pdf": "http://arxiv.org/pdf/1903.11437v1.pdf",
            "title": "Using Monolingual Data in Neural Machine Translation: a Systematic Study",
            "abstract": "Neural Machine Translation (MT) has radically changed the way systems are\ndeveloped. A major difference with the previous generation (Phrase-Based MT) is\nthe way monolingual target data, which often abounds, is used in these two\nparadigms. While Phrase-Based MT can seamlessly integrate very large language\nmodels trained on billions of sentences, the best option for Neural MT\ndevelopers seems to be the generation of artificial parallel data through\n\\textsl{back-translation} - a technique that fails to fully take advantage of\nexisting datasets. In this paper, we conduct a systematic study of\nback-translation, comparing alternative uses of monolingual data, as well as\nmultiple data generation procedures. Our findings confirm that back-translation\nis very effective and give new explanations as to why this is the case. We also\nintroduce new data simulation techniques that are almost as effective, yet much\ncheaper to implement.",
            "authors": [
                "François Yvon",
                "Franck Burlot"
            ],
            "published": "2019-03-27",
            "conference": "using-monolingual-data-in-neural-machine",
            "conference_url_abs": "https://aclanthology.org/W18-6315",
            "conference_url_pdf": "https://aclanthology.org/W18-6315.pdf",
            "proceeding": "ws-2018-10"
        },
        {
            "id": "baracks-wife-hillary-using-knowledge-graphs",
            "arxiv_id": "1906.07241",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1906.07241v2",
            "url_pdf": "https://arxiv.org/pdf/1906.07241v2.pdf",
            "title": "Barack's Wife Hillary: Using Knowledge-Graphs for Fact-Aware Language Modeling",
            "abstract": "Modeling human language requires the ability to not only generate fluent text but also encode factual knowledge. However, traditional language models are only capable of remembering facts seen at training time, and often have difficulty recalling them. To address this, we introduce the knowledge graph language model (KGLM), a neural language model with mechanisms for selecting and copying facts from a knowledge graph that are relevant to the context. These mechanisms enable the model to render information it has never seen before, as well as generate out-of-vocabulary tokens. We also introduce the Linked WikiText-2 dataset, a corpus of annotated text aligned to the Wikidata knowledge graph whose contents (roughly) match the popular WikiText-2 benchmark. In experiments, we demonstrate that the KGLM achieves significantly better performance than a strong baseline language model. We additionally compare different language model's ability to complete sentences requiring factual knowledge, showing that the KGLM outperforms even very large language models in generating facts.",
            "authors": [
                "Sameer Singh",
                "Matt Gardner",
                "Nelson F. Liu",
                "Matthew E. Peters",
                "Robert L. Logan IV"
            ],
            "published": "2019-06-17",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "baracks-wife-hillary-using-knowledge-graphs-1",
            "arxiv_id": null,
            "nips_id": null,
            "url_abs": "https://aclanthology.org/P19-1598",
            "url_pdf": "https://aclanthology.org/P19-1598.pdf",
            "title": "Barack's Wife Hillary: Using Knowledge Graphs for Fact-Aware Language Modeling",
            "abstract": "Modeling human language requires the ability to not only generate fluent text but also encode factual knowledge. However, traditional language models are only capable of remembering facts seen at training time, and often have difficulty recalling them. To address this, we introduce the knowledge graph language model (KGLM), a neural language model with mechanisms for selecting and copying facts from a knowledge graph that are relevant to the context. These mechanisms enable the model to render information it has never seen before, as well as generate out-of-vocabulary tokens. We also introduce the Linked WikiText-2 dataset, a corpus of annotated text aligned to the Wikidata knowledge graph whose contents (roughly) match the popular WikiText-2 benchmark. In experiments, we demonstrate that the KGLM achieves significantly better performance than a strong baseline language model. We additionally compare different language model{'}s ability to complete sentences requiring factual knowledge, showing that the KGLM outperforms even very large language models in generating facts.",
            "authors": [
                "Robert Logan",
                "Nelson F. Liu",
                "Sameer Singh",
                "Matthew E. Peters",
                "Matt Gardner"
            ],
            "published": "2019-07-01",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": "acl-2019-7"
        },
        {
            "id": "release-strategies-and-the-social-impacts-of",
            "arxiv_id": "1908.09203",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1908.09203v2",
            "url_pdf": "https://arxiv.org/pdf/1908.09203v2.pdf",
            "title": "Release Strategies and the Social Impacts of Language Models",
            "abstract": "Large language models have a range of beneficial uses: they can assist in prose, poetry, and programming; analyze dataset biases; and more. However, their flexibility and generative capabilities also raise misuse concerns. This report discusses OpenAI's work related to the release of its GPT-2 language model. It discusses staged release, which allows time between model releases to conduct risk and benefit analyses as model sizes increased. It also discusses ongoing partnership-based research and provides recommendations for better coordination and responsible publication in AI.",
            "authors": [
                "Miles McCain",
                "Ariel Herbert-Voss",
                "Jack Clark",
                "Jong Wook Kim",
                "Jeff Wu",
                "Jason Blazakis",
                "Alex Newhouse",
                "Amanda Askell",
                "Alec Radford",
                "Sarah Kreps",
                "Miles Brundage",
                "Kris McGuffie",
                "Jasmine Wang",
                "Irene Solaiman",
                "Gretchen Krueger"
            ],
            "published": "2019-08-24",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "megatron-lm-training-multi-billion-parameter",
            "arxiv_id": "1909.08053",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1909.08053v4",
            "url_pdf": "https://arxiv.org/pdf/1909.08053v4.pdf",
            "title": "Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism",
            "abstract": "Recent work in language modeling demonstrates that training large transformer models advances the state of the art in Natural Language Processing applications. However, very large models can be quite difficult to train due to memory constraints. In this work, we present our techniques for training very large transformer models and implement a simple, efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain 15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9 billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy of 89.4%).",
            "authors": [
                "Patrick Legresley",
                "Mostofa Patwary",
                "Jared Casper",
                "Mohammad Shoeybi",
                "Raul Puri",
                "Bryan Catanzaro"
            ],
            "published": "2019-09-17",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "structured-pruning-of-large-language-models",
            "arxiv_id": "1910.04732",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1910.04732v2",
            "url_pdf": "https://arxiv.org/pdf/1910.04732v2.pdf",
            "title": "Structured Pruning of Large Language Models",
            "abstract": "Large language models have recently achieved state of the art performance across a wide variety of natural language tasks. Meanwhile, the size of these models and their latency have significantly increased, which makes their usage costly, and raises an interesting question: do language models need to be large? We study this question through the lens of model compression. We present a generic, structured pruning approach by parameterizing each weight matrix using its low-rank factorization, and adaptively removing rank-1 components during training. On language modeling tasks, our structured approach outperforms other unstructured and block-structured pruning baselines at various compression levels, while achieving significant speedups during both training and inference. We also demonstrate that our method can be applied to pruning adaptive word embeddings in large language models, and to pruning the BERT model on several downstream fine-tuning classification benchmarks.",
            "authors": [
                "Tao Lei",
                "Ziheng Wang",
                "Jeremy Wohlwend"
            ],
            "published": "2019-10-10",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.emnlp-main.496",
            "conference_url_pdf": "https://aclanthology.org/2020.emnlp-main.496.pdf",
            "proceeding": "emnlp-2020-11"
        },
        {
            "id": "exbert-a-visual-analysis-tool-to-explore",
            "arxiv_id": "1910.05276",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1910.05276v1",
            "url_pdf": "https://arxiv.org/pdf/1910.05276v1.pdf",
            "title": "exBERT: A Visual Analysis Tool to Explore Learned Representations in Transformers Models",
            "abstract": "Large language models can produce powerful contextual representations that lead to improvements across many NLP tasks. Since these models are typically guided by a sequence of learned self attention mechanisms and may comprise undesired inductive biases, it is paramount to be able to explore what the attention has learned. While static analyses of these models lead to targeted insights, interactive tools are more dynamic and can help humans better gain an intuition for the model-internal reasoning process. We present exBERT, an interactive tool named after the popular BERT language model, that provides insights into the meaning of the contextual representations by matching a human-specified input to similar contexts in a large annotated dataset. By aggregating the annotations of the matching similar contexts, exBERT helps intuitively explain what each attention-head has learned.",
            "authors": [
                "Hendrik Strobelt",
                "Benjamin Hoover",
                "Sebastian Gehrmann"
            ],
            "published": "2019-10-11",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "feature-detection-and-attenuation-in",
            "arxiv_id": "1910.05862",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1910.05862v4",
            "url_pdf": "https://arxiv.org/pdf/1910.05862v4.pdf",
            "title": "Constrained Non-Affine Alignment of Embeddings",
            "abstract": "Embeddings are one of the fundamental building blocks for data analysis tasks. Embeddings are already essential tools for large language models and image analysis, and their use is being extended to many other research domains. The generation of these distributed representations is often a data- and computation-expensive process; yet the holistic analysis and adjustment of them after they have been created is still a developing area. In this paper, we first propose a very general quantitatively measure for the presence of features in the embedding data based on if it can be learned. We then devise a method to remove or alleviate undesired features in the embedding while retaining the essential structure of the data. We use a Domain Adversarial Network (DAN) to generate a non-affine transformation, but we add constraints to ensure the essential structure of the embedding is preserved. Our empirical results demonstrate that the proposed algorithm significantly outperforms the state-of-art unsupervised algorithm on several data sets, including novel applications from the industry.",
            "authors": [
                "Chin-Chia Michael Yeh",
                "Jeff M. Phillips",
                "Wei zhang",
                "Feifei Li",
                "Bendre Mangesh",
                "Das Mahashweta",
                "Zhongfang Zhuang",
                "Yan Zheng",
                "Yanqing Peng",
                "Yuwei Wang"
            ],
            "published": "2019-10-13",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "transfer-learning-from-transformers-to-fake",
            "arxiv_id": "1910.14353",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1910.14353v1",
            "url_pdf": "https://arxiv.org/pdf/1910.14353v1.pdf",
            "title": "Transfer Learning from Transformers to Fake News Challenge Stance Detection (FNC-1) Task",
            "abstract": "In this paper, we report improved results of the Fake News Challenge Stage 1 (FNC-1) stance detection task. This gain in performance is due to the generalization power of large language models based on Transformer architecture, invented, trained and publicly released over the last two years. Specifically (1) we improved the FNC-1 best performing model adding BERT sentence embedding of input sequences as a model feature, (2) we fine-tuned BERT, XLNet, and RoBERTa transformers on FNC-1 extended dataset and obtained state-of-the-art results on FNC-1 task.",
            "authors": [
                "Valeriya Slovikovskaya"
            ],
            "published": "2019-10-31",
            "conference": "transfer-learning-from-transformers-to-fake-1",
            "conference_url_abs": "https://aclanthology.org/2020.lrec-1.152",
            "conference_url_pdf": "https://aclanthology.org/2020.lrec-1.152.pdf",
            "proceeding": "lrec-2020-5"
        },
        {
            "id": "do-nuclear-submarines-have-nuclear-captains-a",
            "arxiv_id": null,
            "nips_id": null,
            "url_abs": "https://aclanthology.org/D19-1625",
            "url_pdf": "https://aclanthology.org/D19-1625.pdf",
            "title": "Do Nuclear Submarines Have Nuclear Captains? A Challenge Dataset for Commonsense Reasoning over Adjectives and Objects",
            "abstract": "How do adjectives project from a noun to its parts? If a motorcycle is red, are its wheels red? Is a nuclear submarine{'}s captain nuclear? These questions are easy for humans to judge using our commonsense understanding of the world, but are difficult for computers. To attack this challenge, we crowdsource a set of human judgments that answer the English-language question {``}Given a whole described by an adjective, does the adjective also describe a given part?{''} We build strong baselines for this task with a classification approach. Our findings indicate that, despite the recent successes of large language models on tasks aimed to assess commonsense knowledge, these models do not greatly outperform simple word-level models based on pre-trained word embeddings. This provides evidence that the amount of commonsense knowledge encoded in these language models does not extend far beyond that already baked into the word embeddings. Our dataset will serve as a useful testbed for future research in commonsense reasoning, especially as it relates to adjectives and objects",
            "authors": [
                "Nanyun Peng",
                "James Mullenbach",
                "Jonathan Gordon",
                "Jonathan May"
            ],
            "published": "2019-11-01",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": "ijcnlp-2019-11"
        },
        {
            "id": "paraphrasing-with-large-language-models-1",
            "arxiv_id": "1911.09661",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1911.09661v1",
            "url_pdf": "https://arxiv.org/pdf/1911.09661v1.pdf",
            "title": "Paraphrasing with Large Language Models",
            "abstract": "Recently, large language models such as GPT-2 have shown themselves to be extremely adept at text generation and have also been able to achieve high-quality results in many downstream NLP tasks such as text classification, sentiment analysis and question answering with the aid of fine-tuning. We present a useful technique for using a large language model to perform the task of paraphrasing on a variety of texts and subjects. Our approach is demonstrated to be capable of generating paraphrases not only at a sentence level but also for longer spans of text such as paragraphs without needing to break the text into smaller chunks.",
            "authors": [
                "Sam Witteveen",
                "Martin Andrews"
            ],
            "published": "2019-11-21",
            "conference": "paraphrasing-with-large-language-models",
            "conference_url_abs": "https://aclanthology.org/D19-5623",
            "conference_url_pdf": "https://aclanthology.org/D19-5623.pdf",
            "proceeding": "ws-2019-11"
        },
        {
            "id": "waldorf-wasteless-language-model-distillation",
            "arxiv_id": "1912.06638",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/1912.06638v2",
            "url_pdf": "https://arxiv.org/pdf/1912.06638v2.pdf",
            "title": "WaLDORf: Wasteless Language-model Distillation On Reading-comprehension",
            "abstract": "Transformer based Very Large Language Models (VLLMs) like BERT, XLNet and RoBERTa, have recently shown tremendous performance on a large variety of Natural Language Understanding (NLU) tasks. However, due to their size, these VLLMs are extremely resource intensive and cumbersome to deploy at production time. Several recent publications have looked into various ways to distil knowledge from a transformer based VLLM (most commonly BERT-Base) into a smaller model which can run much faster at inference time. Here, we propose a novel set of techniques which together produce a task-specific hybrid convolutional and transformer model, WaLDORf, that achieves state-of-the-art inference speed while still being more accurate than previous distilled models.",
            "authors": [
                "Hans-Martin Will",
                "Pai-Hung Chen",
                "James Yi Tian",
                "Alexander P. Kreuzer"
            ],
            "published": "2019-12-13",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "on-the-comparability-of-pre-trained-language",
            "arxiv_id": "2001.00781",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2001.00781v1",
            "url_pdf": "https://arxiv.org/pdf/2001.00781v1.pdf",
            "title": "On the comparability of Pre-trained Language Models",
            "abstract": "Recent developments in unsupervised representation learning have successfully established the concept of transfer learning in NLP. Mainly three forces are driving the improvements in this area of research: More elaborated architectures are making better use of contextual information. Instead of simply plugging in static pre-trained representations, these are learned based on surrounding context in end-to-end trainable models with more intelligently designed language modelling objectives. Along with this, larger corpora are used as resources for pre-training large language models in a self-supervised fashion which are afterwards fine-tuned on supervised tasks. Advances in parallel computing as well as in cloud computing, made it possible to train these models with growing capacities in the same or even in shorter time than previously established models. These three developments agglomerate in new state-of-the-art (SOTA) results being revealed in a higher and higher frequency. It is not always obvious where these improvements originate from, as it is not possible to completely disentangle the contributions of the three driving forces. We set ourselves to providing a clear and concise overview on several large pre-trained language models, which achieved SOTA results in the last two years, with respect to their use of new architectures and resources. We want to clarify for the reader where the differences between the models are and we furthermore attempt to gain some insight into the single contributions of lexical/computational improvements as well as of architectural changes. We explicitly do not intend to quantify these contributions, but rather see our work as an overview in order to identify potential starting points for benchmark comparisons. Furthermore, we tentatively want to point at potential possibilities for improvement in the field of open-sourcing and reproducible research.",
            "authors": [
                "Christian Heumann",
                "Matthias Aßenmacher"
            ],
            "published": "2020-01-03",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "training-question-answering-models-from",
            "arxiv_id": "2002.09599",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2002.09599v1",
            "url_pdf": "https://arxiv.org/pdf/2002.09599v1.pdf",
            "title": "Training Question Answering Models From Synthetic Data",
            "abstract": "Question and answer generation is a data augmentation method that aims to improve question answering (QA) models given the limited amount of human labeled data. However, a considerable gap remains between synthetic and human-generated question-answer pairs. This work aims to narrow this gap by taking advantage of large language models and explores several factors such as model size, quality of pretrained models, scale of data synthesized, and algorithmic choices. On the SQuAD1.1 question answering task, we achieve higher accuracy using solely synthetic questions and answers than when using the SQuAD1.1 training set questions alone. Removing access to real Wikipedia data, we synthesize questions and answers from a synthetic corpus generated by an 8.3 billion parameter GPT-2 model. With no access to human supervision and only access to other models, we are able to train state of the art question answering networks on entirely model-generated data that achieve 88.4 Exact Match (EM) and 93.9 F1 score on the SQuAD1.1 dev set. We further apply our methodology to SQuAD2.0 and show a 2.8 absolute gain on EM score compared to prior work using synthetic data.",
            "authors": [
                "Mostofa Patwary",
                "Mohammad Shoeybi",
                "Raul Puri",
                "Ryan Spring",
                "Bryan Catanzaro"
            ],
            "published": "2020-02-22",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.emnlp-main.468",
            "conference_url_pdf": "https://aclanthology.org/2020.emnlp-main.468.pdf",
            "proceeding": "emnlp-2020-11"
        },
        {
            "id": "the-curious-case-of-developmental-bertology",
            "arxiv_id": "2007.03774",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2007.03774v1",
            "url_pdf": "https://arxiv.org/pdf/2007.03774v1.pdf",
            "title": "The curious case of developmental BERTology: On sparsity, transfer learning, generalization and the brain",
            "abstract": "In this essay, we explore a point of intersection between deep learning and neuroscience, through the lens of large language models, transfer learning and network compression. Just like perceptual and cognitive neurophysiology has inspired effective deep neural network architectures which in turn make a useful model for understanding the brain, here we explore how biological neural development might inspire efficient and robust optimization procedures which in turn serve as a useful model for the maturation and aging of the brain.",
            "authors": [
                "Xin Wang"
            ],
            "published": "2020-07-07",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "language-models-as-few-shot-learner-for-task",
            "arxiv_id": "2008.06239",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2008.06239v2",
            "url_pdf": "https://arxiv.org/pdf/2008.06239v2.pdf",
            "title": "Language Models as Few-Shot Learner for Task-Oriented Dialogue Systems",
            "abstract": "Task-oriented dialogue systems use four connected modules, namely, Natural Language Understanding (NLU), a Dialogue State Tracking (DST), Dialogue Policy (DP) and Natural Language Generation (NLG). A research challenge is to learn each module with the least amount of samples (i.e., few-shots) given the high cost related to the data collection. The most common and effective technique to solve this problem is transfer learning, where large language models, either pre-trained on text or task-specific data, are fine-tuned on the few samples. These methods require fine-tuning steps and a set of parameters for each task. Differently, language models, such as GPT-2 (Radford et al., 2019) and GPT-3 (Brown et al., 2020), allow few-shot learning by priming the model with few examples. In this paper, we evaluate the priming few-shot ability of language models in the NLU, DST, DP and NLG tasks. Importantly, we highlight the current limitations of this approach, and we discuss the possible implication for future work.",
            "authors": [
                "Zhaojiang Lin",
                "Pascale Fung",
                "Zihan Liu",
                "Andrea Madotto"
            ],
            "published": "2020-08-14",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "controlling-dialogue-generation-with-semantic",
            "arxiv_id": "2008.09075",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2008.09075v2",
            "url_pdf": "https://arxiv.org/pdf/2008.09075v2.pdf",
            "title": "Controlling Dialogue Generation with Semantic Exemplars",
            "abstract": "Dialogue systems pretrained with large language models generate locally coherent responses, but lack the fine-grained control over responses necessary to achieve specific goals. A promising method to control response generation is exemplar-based generation, in which models edit exemplar responses that are retrieved from training data, or hand-written to strategically address discourse-level goals, to fit new dialogue contexts. But, current exemplar-based approaches often excessively copy words from the exemplar responses, leading to incoherent replies. We present an Exemplar-based Dialogue Generation model, EDGE, that uses the semantic frames present in exemplar responses to guide generation. We show that controlling dialogue generation based on the semantic frames of exemplars, rather than words in the exemplar itself, improves the coherence of generated responses, while preserving semantic meaning and conversation goals present in exemplar responses.",
            "authors": [
                "Prakhar Gupta",
                "Amy Pavel",
                "Yulia Tsvetkov",
                "Jeffrey P. Bigham"
            ],
            "published": "2020-08-20",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.naacl-main.240",
            "conference_url_pdf": "https://aclanthology.org/2021.naacl-main.240.pdf",
            "proceeding": "naacl-2021-4"
        },
        {
            "id": "the-adapter-bot-all-in-one-controllable",
            "arxiv_id": "2008.12579",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2008.12579v2",
            "url_pdf": "https://arxiv.org/pdf/2008.12579v2.pdf",
            "title": "The Adapter-Bot: All-In-One Controllable Conversational Model",
            "abstract": "Considerable progress has been made towards conversational models that generate coherent and fluent responses by training large language models on large dialogue datasets. These models have little or no control of the generated responses and miss two important features: continuous dialogue skills integration and seamlessly leveraging diverse knowledge sources. In this paper, we propose the Adapter-Bot, a dialogue model that uses a fixed backbone conversational model such as DialGPT (Zhang et al., 2019) and triggers on-demand dialogue skills (e.g., emphatic response, weather information, movie recommendation) via different adapters (Houlsby et al., 2019). Each adapter can be trained independently, thus allowing a continual integration of skills without retraining the entire model. Depending on the skills, the model is able to process multiple knowledge types, such as text, tables, and graphs, in a seamless manner. The dialogue skills can be triggered automatically via a dialogue manager, or manually, thus allowing high-level control of the generated responses. At the current stage, we have implemented 12 response styles (e.g., positive, negative etc.), 8 goal-oriented skills (e.g. weather information, movie recommendation, etc.), and personalized and emphatic responses. We evaluate our model using automatic evaluation by comparing it with existing state-of-the-art conversational models, and we have released an interactive system at adapter.bot.ust.hk.",
            "authors": [
                "Pascale Fung",
                "Yejin Bang",
                "Andrea Madotto",
                "Zhaojiang Lin"
            ],
            "published": "2020-08-28",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "gedi-generative-discriminator-guided-sequence",
            "arxiv_id": "2009.06367",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2009.06367v2",
            "url_pdf": "https://arxiv.org/pdf/2009.06367v2.pdf",
            "title": "GeDi: Generative Discriminator Guided Sequence Generation",
            "abstract": "While large-scale language models (LMs) are able to imitate the distribution of natural language well enough to generate realistic text, it is difficult to control which regions of the distribution they generate. This is especially problematic because datasets used for training large LMs usually contain significant toxicity, hate, bias, and negativity. We propose GeDi as an efficient method for using smaller LMs as generative discriminators to guide generation from large LMs to make them safer and more controllable. GeDi guides generation at each step by computing classification probabilities for all possible next tokens via Bayes rule by normalizing over two class-conditional distributions; one conditioned on the desired attribute, or control code, and another conditioned on the undesired attribute, or anti control code. We find that GeDi gives stronger controllability than the state of the art method while also achieving generation speeds more than 30 times faster. Additionally, training GeDi on only four topics allows us to controllably generate new topics zero-shot from just a keyword, unlocking a new capability that previous controllable generation methods do not have. Lastly, we show that GeDi can make GPT-2 (1.5B parameters) significantly less toxic without sacrificing linguistic quality, making it by far the most practical existing method for detoxifying large language models while maintaining a fast generation speed.",
            "authors": [
                "Nazneen Fatema Rajani",
                "Richard Socher",
                "Akhilesh Deepak Gotmare",
                "Shafiq Joty",
                "Bryan McCann",
                "Ben Krause",
                "Nitish Shirish Keskar"
            ],
            "published": "2020-09-14",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.findings-emnlp.424",
            "conference_url_pdf": "https://aclanthology.org/2021.findings-emnlp.424.pdf",
            "proceeding": "findings-emnlp-2021-11"
        },
        {
            "id": "content-planning-for-neural-story-generation",
            "arxiv_id": "2009.09870",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2009.09870v2",
            "url_pdf": "https://arxiv.org/pdf/2009.09870v2.pdf",
            "title": "Content Planning for Neural Story Generation with Aristotelian Rescoring",
            "abstract": "Long-form narrative text generated from large language models manages a fluent impersonation of human writing, but only at the local sentence level, and lacks structure or global cohesion. We posit that many of the problems of story generation can be addressed via high-quality content planning, and present a system that focuses on how to learn good plot structures to guide story generation. We utilize a plot-generation language model along with an ensemble of rescoring models that each implement an aspect of good story-writing as detailed in Aristotle's Poetics. We find that stories written with our more principled plot-structure are both more relevant to a given prompt and higher quality than baselines that do not content plan, or that plan in an unprincipled way.",
            "authors": [
                "Seraphina Goldfarb-Tarrant",
                "Nanyun Peng",
                "Ralph Weischedel",
                "Tuhin Chakrabarty"
            ],
            "published": "2020-09-21",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.emnlp-main.351",
            "conference_url_pdf": "https://aclanthology.org/2020.emnlp-main.351.pdf",
            "proceeding": "emnlp-2020-11"
        },
        {
            "id": "megatron-cntrl-controllable-story-generation",
            "arxiv_id": "2010.00840",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2010.00840v1",
            "url_pdf": "https://arxiv.org/pdf/2010.00840v1.pdf",
            "title": "MEGATRON-CNTRL: Controllable Story Generation with External Knowledge Using Large-Scale Language Models",
            "abstract": "Existing pre-trained large language models have shown unparalleled generative capabilities. However, they are not controllable. In this paper, we propose MEGATRON-CNTRL, a novel framework that uses large-scale language models and adds control to text generation by incorporating an external knowledge base. Our framework consists of a keyword predictor, a knowledge retriever, a contextual knowledge ranker, and a conditional text generator. As we do not have access to ground-truth supervision for the knowledge ranker, we make use of weak supervision from sentence embedding. The empirical results show that our model generates more fluent, consistent, and coherent stories with less repetition and higher diversity compared to prior work on the ROC story dataset. We showcase the controllability of our model by replacing the keywords used to generate stories and re-running the generation process. Human evaluation results show that 77.5% of these stories are successfully controlled by the new keywords. Furthermore, by scaling our model from 124 million to 8.3 billion parameters we demonstrate that larger models improve both the quality of generation (from 74.5% to 93.0% for consistency) and controllability (from 77.5% to 91.5%).",
            "authors": [
                "Bryan Catanzaro",
                "Anima Anandkumar",
                "Pascale Fung",
                "Raul Puri",
                "Mohammad Shoeybi",
                "Mostofa Patwary",
                "Peng Xu"
            ],
            "published": "2020-10-02",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.emnlp-main.226",
            "conference_url_pdf": "https://aclanthology.org/2020.emnlp-main.226.pdf",
            "proceeding": "emnlp-2020-11"
        },
        {
            "id": "an-empirical-investigation-towards-efficient",
            "arxiv_id": "2010.00784",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2010.00784v1",
            "url_pdf": "https://arxiv.org/pdf/2010.00784v1.pdf",
            "title": "An Empirical Investigation Towards Efficient Multi-Domain Language Model Pre-training",
            "abstract": "Pre-training large language models has become a standard in the natural language processing community. Such models are pre-trained on generic data (e.g. BookCorpus and English Wikipedia) and often fine-tuned on tasks in the same domain. However, in order to achieve state-of-the-art performance on out of domain tasks such as clinical named entity recognition and relation extraction, additional in domain pre-training is required. In practice, staged multi-domain pre-training presents performance deterioration in the form of catastrophic forgetting (CF) when evaluated on a generic benchmark such as GLUE. In this paper we conduct an empirical investigation into known methods to mitigate CF. We find that elastic weight consolidation provides best overall scores yielding only a 0.33% drop in performance across seven generic tasks while remaining competitive in bio-medical tasks. Furthermore, we explore gradient and latent clustering based data selection techniques to improve coverage when using elastic weight consolidation and experience replay methods.",
            "authors": [
                "Parminder Bhatia",
                "Qing Sun",
                "Kristjan Arumae"
            ],
            "published": "2020-10-01",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.emnlp-main.394",
            "conference_url_pdf": "https://aclanthology.org/2020.emnlp-main.394.pdf",
            "proceeding": "emnlp-2020-11"
        },
        {
            "id": "dact-bert-increasing-the-efficiency-and",
            "arxiv_id": null,
            "nips_id": null,
            "url_abs": "https://openreview.net/forum?id=wKfXaxPist",
            "url_pdf": "https://openreview.net/pdf?id=wKfXaxPist",
            "title": "DACT-BERT: Increasing the efficiency and interpretability of BERT by using adaptive computation time.",
            "abstract": "Large-scale pre-trained language models have shown remarkable results in diverse NLP applications. Unfortunately, these performance gains have been accompanied by a significant increase in computation time and model size, stressing the need to develop new or complementary strategies to increase the efficiency and interpretability of current large language models, such as BERT. In this paper we propose DACT-BERT, a differentiable adaptive computation time strategy for BERT language model. DACT-BERT adds an adaptive computation mechanism to the regular processing pipeline of BERT. This mechanism controls the number of transformer blocks that BERT needs to execute at inference time. By doing this, the model makes predictions based on the most appropriate intermediate representations for the task encoded by the pre-trained weights. With respect to previous works, DACT-BERT has the advantage of being fully differentiable and directly integrated to BERT's main processing pipeline. This enables the incorporation of gradient-based transparency mechanisms to improve interpretability. Furthermore, by discarding useless steps, DACT-BERT facilitates the understanding of the underlying process used by BERT to reach an inference. Our experiments demonstrate that our approach is effective in significantly reducing computational complexity without affecting model accuracy. Additionally, they also demonstrate that DACT-BERT helps to improve model interpretability. ",
            "authors": [
                "Alvaro Soto",
                "Vladimir Araujo",
                "Felipe del Rio",
                "Cristobal Eyzaguirre"
            ],
            "published": "2021-01-01",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "modeling-preconditions-in-text-with-a-crowd",
            "arxiv_id": "2010.02429",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2010.02429v3",
            "url_pdf": "https://arxiv.org/pdf/2010.02429v3.pdf",
            "title": "Modeling Preconditions in Text with a Crowd-sourced Dataset",
            "abstract": "Preconditions provide a form of logical connection between events that explains why some events occur together and information that is complementary to the more widely studied relations such as causation, temporal ordering, entailment, and discourse relations. Modeling preconditions in text has been hampered in part due to the lack of large scale labeled data grounded in text. This paper introduces PeKo, a crowd-sourced annotation of preconditions between event pairs in newswire, an order of magnitude larger than prior text annotations. To complement this new corpus, we also introduce two challenge tasks aimed at modeling preconditions: (i) Precondition Identification -- a standard classification task defined over pairs of event mentions, and (ii) Precondition Generation -- a generative task aimed at testing a more general ability to reason about a given event. Evaluation on both tasks shows that modeling preconditions is challenging even for today's large language models (LM). This suggests that precondition knowledge is not easily accessible in LM-derived representations alone. Our generation results show that fine-tuning an LM on PeKo yields better conditional relations than when trained on raw text or temporally-ordered corpora.",
            "authors": [
                "Niranjan Balasubramanian",
                "Nathanael Chambers",
                "Keerthi Kumar Kallur",
                "Anmol Shukla",
                "Gargi Sawhney",
                "Pratyush Singh",
                "Mahnaz Koupaee",
                "Heeyoung Kwon"
            ],
            "published": "2020-10-06",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.findings-emnlp.340",
            "conference_url_pdf": "https://aclanthology.org/2020.findings-emnlp.340.pdf",
            "proceeding": "findings-of-the-association-for-computational"
        },
        {
            "id": "plug-and-play-conversational-models",
            "arxiv_id": "2010.04344",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2010.04344v1",
            "url_pdf": "https://arxiv.org/pdf/2010.04344v1.pdf",
            "title": "Plug-and-Play Conversational Models",
            "abstract": "There has been considerable progress made towards conversational models that generate coherent and fluent responses; however, this often involves training large language models on large dialogue datasets, such as Reddit. These large conversational models provide little control over the generated responses, and this control is further limited in the absence of annotated conversational datasets for attribute specific generation that can be used for fine-tuning the model. In this paper, we first propose and evaluate plug-and-play methods for controllable response generation, which does not require dialogue specific datasets and does not rely on fine-tuning a large model. While effective, the decoding procedure induces considerable computational overhead, rendering the conversational model unsuitable for interactive usage. To overcome this, we introduce an approach that does not require further computation at decoding time, while also does not require any fine-tuning of a large language model. We demonstrate, through extensive automatic and human evaluation, a high degree of control over the generated conversational responses with regard to multiple desired attributes, while being fluent.",
            "authors": [
                "Pascale Fung",
                "Sumanth Dathathri",
                "Zhaojiang Lin",
                "Etsuko Ishii",
                "Andrea Madotto"
            ],
            "published": "2020-10-09",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2020.findings-emnlp.219",
            "conference_url_pdf": "https://aclanthology.org/2020.findings-emnlp.219.pdf",
            "proceeding": "findings-of-the-association-for-computational"
        },
        {
            "id": "extracting-training-data-from-large-language",
            "arxiv_id": "2012.07805",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2012.07805v2",
            "url_pdf": "https://arxiv.org/pdf/2012.07805v2.pdf",
            "title": "Extracting Training Data from Large Language Models",
            "abstract": "It has become common to publish large (billion parameter) language models that have been trained on private datasets. This paper demonstrates that in such settings, an adversary can perform a training data extraction attack to recover individual training examples by querying the language model. We demonstrate our attack on GPT-2, a language model trained on scrapes of the public Internet, and are able to extract hundreds of verbatim text sequences from the model's training data. These extracted examples include (public) personally identifiable information (names, phone numbers, and email addresses), IRC conversations, code, and 128-bit UUIDs. Our attack is possible even though each of the above sequences are included in just one document in the training data. We comprehensively evaluate our extraction attack to understand the factors that contribute to its success. Worryingly, we find that larger models are more vulnerable than smaller models. We conclude by drawing lessons and discussing possible safeguards for training large language models.",
            "authors": [
                "Colin Raffel",
                "Alina Oprea",
                "Ulfar Erlingsson",
                "Dawn Song",
                "Tom Brown",
                "Adam Roberts",
                "Katherine Lee",
                "Ariel Herbert-Voss",
                "Matthew Jagielski",
                "Eric Wallace",
                "Florian Tramer",
                "Nicholas Carlini"
            ],
            "published": "2020-12-14",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "evolution-is-all-you-need-phylogenetic",
            "arxiv_id": "2012.13475",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2012.13475v1",
            "url_pdf": "https://arxiv.org/pdf/2012.13475v1.pdf",
            "title": "Evolution Is All You Need: Phylogenetic Augmentation for Contrastive Learning",
            "abstract": "Self-supervised representation learning of biological sequence embeddings alleviates computational resource constraints on downstream tasks while circumventing expensive experimental label acquisition. However, existing methods mostly borrow directly from large language models designed for NLP, rather than with bioinformatics philosophies in mind. Recently, contrastive mutual information maximization methods have achieved state-of-the-art representations for ImageNet. In this perspective piece, we discuss how viewing evolution as natural sequence augmentation and maximizing information across phylogenetic \"noisy channels\" is a biologically and theoretically desirable objective for pretraining encoders. We first provide a review of current contrastive learning literature, then provide an illustrative example where we show that contrastive learning using evolutionary augmentation can be used as a representation learning objective which maximizes the mutual information between biological sequences and their conserved function, and finally outline rationale for this approach.",
            "authors": [
                "Alan Moses",
                "Alex X. Lu",
                "Amy X. Lu"
            ],
            "published": "2020-12-25",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "understanding-few-shot-commonsense-knowledge",
            "arxiv_id": "2101.00297",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2101.00297v3",
            "url_pdf": "https://arxiv.org/pdf/2101.00297v3.pdf",
            "title": "Analyzing Commonsense Emergence in Few-shot Knowledge Models",
            "abstract": "Recently, commonsense knowledge models - pretrained language models (LM) fine-tuned on knowledge graph (KG) tuples - showed that considerable amounts of commonsense knowledge can be encoded in the parameters of large language models. However, as parallel studies show that LMs are poor hypothesizers of declarative commonsense relationships on their own, it remains unclear whether this knowledge is learned during pretraining or from fine-tuning on KG examples. To investigate this question, we train commonsense knowledge models in few-shot settings to study the emergence of their commonsense representation abilities. Our results show that commonsense knowledge models can rapidly adapt from limited examples, indicating that KG fine-tuning serves to learn an interface to encoded knowledge learned during pretraining. Importantly, our analysis of absolute, angular, and distributional parameter changes during few-shot fine-tuning provides novel insights into how this interface is learned.",
            "authors": [
                "Antoine Bosselut",
                "Yejin Choi",
                "Ximing Lu",
                "Ronan Le Bras",
                "Jeff Da"
            ],
            "published": "2021-01-01",
            "conference": null,
            "conference_url_abs": "https://openreview.net/forum?id=StHCELh9PVE",
            "conference_url_pdf": "https://openreview.net/pdf?id=StHCELh9PVE",
            "proceeding": "akbc-2021-10"
        },
        {
            "id": "persistent-anti-muslim-bias-in-large-language",
            "arxiv_id": "2101.05783",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2101.05783v2",
            "url_pdf": "https://arxiv.org/pdf/2101.05783v2.pdf",
            "title": "Persistent Anti-Muslim Bias in Large Language Models",
            "abstract": "It has been observed that large-scale language models capture undesirable societal biases, e.g. relating to race and gender; yet religious bias has been relatively unexplored. We demonstrate that GPT-3, a state-of-the-art contextual language model, captures persistent Muslim-violence bias. We probe GPT-3 in various ways, including prompt completion, analogical reasoning, and story generation, to understand this anti-Muslim bias, demonstrating that it appears consistently and creatively in different uses of the model and that it is severe even compared to biases about other religious groups. For instance, \"Muslim\" is analogized to \"terrorist\" in 23% of test cases, while \"Jewish\" is mapped to \"money\" in 5% of test cases. We quantify the positive distraction needed to overcome this bias with adversarial text prompts, and find that use of the most positive 6 adjectives reduces violent completions for \"Muslims\" from 66% to 20%, but which is still higher than for other religious groups.",
            "authors": [
                "James Zou",
                "Maheen Farooqi",
                "Abubakar Abid"
            ],
            "published": "2021-01-14",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "distilling-large-language-models-into-tiny",
            "arxiv_id": "2101.08890",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2101.08890v1",
            "url_pdf": "https://arxiv.org/pdf/2101.08890v1.pdf",
            "title": "Distilling Large Language Models into Tiny and Effective Students using pQRNN",
            "abstract": "Large pre-trained multilingual models like mBERT, XLM-R achieve state of the art results on language understanding tasks. However, they are not well suited for latency critical applications on both servers and edge devices. It's important to reduce the memory and compute resources required by these models. To this end, we propose pQRNN, a projection-based embedding-free neural encoder that is tiny and effective for natural language processing tasks. Without pre-training, pQRNNs significantly outperform LSTM models with pre-trained embeddings despite being 140x smaller. With the same number of parameters, they outperform transformer baselines thereby showcasing their parameter efficiency. Additionally, we show that pQRNNs are effective student architectures for distilling large pre-trained language models. We perform careful ablations which study the effect of pQRNN parameters, data augmentation, and distillation settings. On MTOP, a challenging multilingual semantic parsing dataset, pQRNN students achieve 95.9\\% of the performance of an mBERT teacher while being 350x smaller. On mATIS, a popular parsing task, pQRNN students on average are able to get to 97.1\\% of the teacher while again being 350x smaller. Our strong results suggest that our approach is great for latency-sensitive applications while being able to leverage large mBERT-like models.",
            "authors": [
                "Melvin Johnson",
                "Edward Li",
                "Aditya Siddhant",
                "Prabhu Kaliamoorthi"
            ],
            "published": "2021-01-21",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "scaling-federated-learning-for-fine-tuning-of",
            "arxiv_id": "2102.00875",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2102.00875v1",
            "url_pdf": "https://arxiv.org/pdf/2102.00875v1.pdf",
            "title": "Scaling Federated Learning for Fine-tuning of Large Language Models",
            "abstract": "Federated learning (FL) is a promising approach to distributed compute, as well as distributed data, and provides a level of privacy and compliance to legal frameworks. This makes FL attractive for both consumer and healthcare applications. While the area is actively being explored, few studies have examined FL in the context of larger language models and there is a lack of comprehensive reviews of robustness across tasks, architectures, numbers of clients, and other relevant factors. In this paper, we explore the fine-tuning of Transformer-based language models in a federated learning setting. We evaluate three popular BERT-variants of different sizes (BERT, ALBERT, and DistilBERT) on a number of text classification tasks such as sentiment analysis and author identification. We perform an extensive sweep over the number of clients, ranging up to 32, to evaluate the impact of distributed compute on task performance in the federated averaging setting. While our findings suggest that the large sizes of the evaluated models are not generally prohibitive to federated training, we found that the different models handle federated averaging to a varying degree. Most notably, DistilBERT converges significantly slower with larger numbers of clients, and under some circumstances, even collapses to chance level performance. Investigating this issue presents an interesting perspective for future research.",
            "authors": [
                "Olof Mogren",
                "Edvin Listo Zec",
                "Leon René Sütfeld",
                "Matteo Barbieri",
                "Sebastian Callh",
                "Agrin Hilmkil"
            ],
            "published": "2021-02-01",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "understanding-the-capabilities-limitations",
            "arxiv_id": "2102.02503",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2102.02503v1",
            "url_pdf": "https://arxiv.org/pdf/2102.02503v1.pdf",
            "title": "Understanding the Capabilities, Limitations, and Societal Impact of Large Language Models",
            "abstract": "On October 14th, 2020, researchers from OpenAI, the Stanford Institute for Human-Centered Artificial Intelligence, and other universities convened to discuss open research questions surrounding GPT-3, the largest publicly-disclosed dense language model at the time. The meeting took place under Chatham House Rules. Discussants came from a variety of research backgrounds including computer science, linguistics, philosophy, political science, communications, cyber policy, and more. Broadly, the discussion centered around two main questions: 1) What are the technical capabilities and limitations of large language models? 2) What are the societal effects of widespread use of large language models? Here, we provide a detailed summary of the discussion organized by the two themes above.",
            "authors": [
                "Deep Ganguli",
                "Jack Clark",
                "Miles Brundage",
                "Alex Tamkin"
            ],
            "published": "2021-02-04",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "how-true-is-gpt-2-an-empirical-analysis-of",
            "arxiv_id": "2102.04130",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2102.04130v3",
            "url_pdf": "https://arxiv.org/pdf/2102.04130v3.pdf",
            "title": "Bias Out-of-the-Box: An Empirical Analysis of Intersectional Occupational Biases in Popular Generative Language Models",
            "abstract": "The capabilities of natural language models trained on large-scale data have increased immensely over the past few years. Open source libraries such as HuggingFace have made these models easily available and accessible. While prior research has identified biases in large language models, this paper considers biases contained in the most popular versions of these models when applied `out-of-the-box' for downstream tasks. We focus on generative language models as they are well-suited for extracting biases inherited from training data. Specifically, we conduct an in-depth analysis of GPT-2, which is the most downloaded text generation model on HuggingFace, with over half a million downloads per month. We assess biases related to occupational associations for different protected categories by intersecting gender with religion, sexuality, ethnicity, political affiliation, and continental name origin. Using a template-based data collection pipeline, we collect 396K sentence completions made by GPT-2 and find: (i) The machine-predicted jobs are less diverse and more stereotypical for women than for men, especially for intersections; (ii) Intersectional interactions are highly relevant for occupational associations, which we quantify by fitting 262 logistic models; (iii) For most occupations, GPT-2 reflects the skewed gender and ethnicity distribution found in US Labor Bureau data, and even pulls the societally-skewed distribution towards gender parity in cases where its predictions deviate from real labor market observations. This raises the normative question of what language models should learn - whether they should reflect or correct for existing inequalities.",
            "authors": [
                "Yuki M. Asano",
                "Aleksandar Shtedritski",
                "Frederic A. Dreyer",
                "Filippo Volpin",
                "Elias Benussi",
                "Haider Iqbal",
                "Yennie Jun",
                "Hannah Kirk"
            ],
            "published": "2021-02-08",
            "conference": null,
            "conference_url_abs": "http://proceedings.neurips.cc/paper/2021/hash/1531beb762df4029513ebf9295e0d34f-Abstract.html",
            "conference_url_pdf": "http://proceedings.neurips.cc/paper/2021/file/1531beb762df4029513ebf9295e0d34f-Paper.pdf",
            "proceeding": "neurips-2021-12"
        },
        {
            "id": "prompt-programming-for-large-language-models",
            "arxiv_id": "2102.07350",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2102.07350v1",
            "url_pdf": "https://arxiv.org/pdf/2102.07350v1.pdf",
            "title": "Prompt Programming for Large Language Models: Beyond the Few-Shot Paradigm",
            "abstract": "Prevailing methods for mapping large generative language models to supervised tasks may fail to sufficiently probe models' novel capabilities. Using GPT-3 as a case study, we show that 0-shot prompts can significantly outperform few-shot prompts. We suggest that the function of few-shot examples in these cases is better described as locating an already learned task rather than meta-learning. This analysis motivates rethinking the role of prompts in controlling and evaluating powerful language models. In this work, we discuss methods of prompt programming, emphasizing the usefulness of considering prompts through the lens of natural language. We explore techniques for exploiting the capacity of narratives and cultural anchors to encode nuanced intentions and techniques for encouraging deconstruction of a problem into components before producing a verdict. Informed by this more encompassing theory of prompt programming, we also introduce the idea of a metaprompt that seeds the model to generate its own natural language prompts for a range of tasks. Finally, we discuss how these more general methods of interacting with language models can be incorporated into existing and future benchmarks and practical applications.",
            "authors": [
                "Kyle McDonell",
                "Laria Reynolds"
            ],
            "published": "2021-02-15",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "when-attention-meets-fast-recurrence-training",
            "arxiv_id": "2102.12459",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2102.12459v3",
            "url_pdf": "https://arxiv.org/pdf/2102.12459v3.pdf",
            "title": "When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute",
            "abstract": "Large language models have become increasingly difficult to train because of the growing computation time and cost. In this work, we present SRU++, a highly-efficient architecture that combines fast recurrence and attention for sequence modeling. SRU++ exhibits strong modeling capacity and training efficiency. On standard language modeling tasks such as Enwik8, Wiki-103 and Billion Word datasets, our model obtains better bits-per-character and perplexity while using 3x-10x less training cost compared to top-performing Transformer models. For instance, our model achieves a state-of-the-art result on the Enwik8 dataset using 1.6 days of training on an 8-GPU machine. We further demonstrate that SRU++ requires minimal attention for near state-of-the-art performance. Our results suggest jointly leveraging fast recurrence with little attention as a promising direction for accelerating model training and inference.",
            "authors": [
                "Tao Lei"
            ],
            "published": "2021-02-24",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.emnlp-main.602",
            "conference_url_pdf": "https://aclanthology.org/2021.emnlp-main.602.pdf",
            "proceeding": "emnlp-2021-11"
        },
        {
            "id": "attribute-alignment-controlling-text",
            "arxiv_id": "2103.11070",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2103.11070v2",
            "url_pdf": "https://arxiv.org/pdf/2103.11070v2.pdf",
            "title": "Attribute Alignment: Controlling Text Generation from Pre-trained Language Models",
            "abstract": "Large language models benefit from training with a large amount of unlabeled text, which gives them increasingly fluent and diverse generation capabilities. However, using these models for text generation that takes into account target attributes, such as sentiment polarity or specific topics, remains a challenge. We propose a simple and flexible method for controlling text generation by aligning disentangled attribute representations. In contrast to recent efforts on training a discriminator to perturb the token level distribution for an attribute, we use the same data to learn an alignment function to guide the pre-trained, non-controlled language model to generate texts with the target attribute without changing the original language model parameters. We evaluate our method on sentiment- and topic-controlled generation, and show large performance gains over previous methods while retaining fluency and diversity.",
            "authors": [
                "Kenji Sagae",
                "Zhou Yu",
                "Dian Yu"
            ],
            "published": "2021-03-20",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.findings-emnlp.194",
            "conference_url_pdf": "https://aclanthology.org/2021.findings-emnlp.194.pdf",
            "proceeding": "findings-emnlp-2021-11"
        },
        {
            "id": "detecting-hate-speech-with-gpt-3",
            "arxiv_id": "2103.12407",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2103.12407v4",
            "url_pdf": "https://arxiv.org/pdf/2103.12407v4.pdf",
            "title": "Detecting Hate Speech with GPT-3",
            "abstract": "Sophisticated language models such as OpenAI's GPT-3 can generate hateful text that targets marginalized groups. Given this capacity, we are interested in whether large language models can be used to identify hate speech and classify text as sexist or racist. We use GPT-3 to identify sexist and racist text passages with zero-, one-, and few-shot learning. We find that with zero- and one-shot learning, GPT-3 can identify sexist or racist text with an average accuracy between 55 per cent and 67 per cent, depending on the category of text and type of learning. With few-shot learning, the model's accuracy can be as high as 85 per cent. Large language models have a role to play in hate speech detection, and with further development they could eventually be used to counter hate speech.",
            "authors": [
                "Annie Collins",
                "Rohan Alexander",
                "Ke-Li Chiu"
            ],
            "published": "2021-03-23",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "shrinking-bigfoot-reducing-wav2vec-2-0",
            "arxiv_id": "2103.15760",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2103.15760v2",
            "url_pdf": "https://arxiv.org/pdf/2103.15760v2.pdf",
            "title": "Shrinking Bigfoot: Reducing wav2vec 2.0 footprint",
            "abstract": "Wav2vec 2.0 is a state-of-the-art speech recognition model which maps speech audio waveforms into latent representations. The largest version of wav2vec 2.0 contains 317 million parameters. Hence, the inference latency of wav2vec 2.0 will be a bottleneck in production, leading to high costs and a significant environmental footprint. To improve wav2vec's applicability to a production setting, we explore multiple model compression methods borrowed from the domain of large language models. Using a teacher-student approach, we distilled the knowledge from the original wav2vec 2.0 model into a student model, which is 2 times faster and 4.8 times smaller than the original model. This increase in performance is accomplished with only a 7% degradation in word error rate (WER). Our quantized model is 3.6 times smaller than the original model, with only a 0.1% degradation in WER. To the best of our knowledge, this is the first work that compresses wav2vec 2.0.",
            "authors": [
                "Jumana Nassour",
                "Raphael Cohen",
                "Parinaz Sobhani",
                "Jason Levy",
                "Ilana Tuil",
                "Akshay Budhkar",
                "Zilun Peng"
            ],
            "published": "2021-03-29",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.sustainlp-1.14",
            "conference_url_pdf": "https://aclanthology.org/2021.sustainlp-1.14.pdf",
            "proceeding": "emnlp-sustainlp-2021-11"
        },
        {
            "id": "base-layers-simplifying-training-of-large",
            "arxiv_id": "2103.16716",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2103.16716v1",
            "url_pdf": "https://arxiv.org/pdf/2103.16716v1.pdf",
            "title": "BASE Layers: Simplifying Training of Large, Sparse Models",
            "abstract": "We introduce a new balanced assignment of experts (BASE) layer for large language models that greatly simplifies existing high capacity sparse layers. Sparse layers can dramatically improve the efficiency of training and inference by routing each token to specialized expert modules that contain only a small fraction of the model parameters. However, it can be difficult to learn balanced routing functions that make full use of the available experts; existing approaches typically use routing heuristics or auxiliary expert-balancing loss functions. In contrast, we formulate token-to-expert allocation as a linear assignment problem, allowing an optimal assignment in which each expert receives an equal number of tokens. This optimal assignment scheme improves efficiency by guaranteeing balanced compute loads, and also simplifies training by not requiring any new hyperparameters or auxiliary losses. Code is publicly released at https://github.com/pytorch/fairseq/",
            "authors": [
                "Luke Zettlemoyer",
                "Naman Goyal",
                "Tim Dettmers",
                "Shruti Bhosale",
                "Mike Lewis"
            ],
            "published": "2021-03-30",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "humor-iitk-at-semeval-2021-task-7-large",
            "arxiv_id": "2104.00933",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.00933v1",
            "url_pdf": "https://arxiv.org/pdf/2104.00933v1.pdf",
            "title": "Humor@IITK at SemEval-2021 Task 7: Large Language Models for Quantifying Humor and Offensiveness",
            "abstract": "Humor and Offense are highly subjective due to multiple word senses, cultural knowledge, and pragmatic competence. Hence, accurately detecting humorous and offensive texts has several compelling use cases in Recommendation Systems and Personalized Content Moderation. However, due to the lack of an extensive labeled dataset, most prior works in this domain haven't explored large neural models for subjective humor understanding. This paper explores whether large neural models and their ensembles can capture the intricacies associated with humor/offense detection and rating. Our experiments on the SemEval-2021 Task 7: HaHackathon show that we can develop reasonable humor and offense detection systems with such models. Our models are ranked third in subtask 1b and consistently ranked around the top 33% of the leaderboard for the remaining subtasks.",
            "authors": [
                "Ashutosh Modi",
                "Lakshay Tyagi",
                "Bholeshwar Khurana",
                "Avik Pal",
                "Aishwarya Gupta"
            ],
            "published": "2021-04-02",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.semeval-1.36",
            "conference_url_pdf": "https://aclanthology.org/2021.semeval-1.36.pdf",
            "proceeding": "semeval-2021"
        },
        {
            "id": "efficient-large-scale-language-model-training",
            "arxiv_id": "2104.04473",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.04473v5",
            "url_pdf": "https://arxiv.org/pdf/2104.04473v5.pdf",
            "title": "Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM",
            "abstract": "Large language models have led to state-of-the-art accuracies across a range of tasks. However, training these models efficiently is challenging for two reasons: a) GPU memory capacity is limited, making it impossible to fit large models on even a multi-GPU server, and b) the number of compute operations required to train these models can result in unrealistically long training times. Consequently, new methods of model parallelism such as tensor and pipeline parallelism have been proposed. Unfortunately, naive usage of these methods leads to fundamental scaling issues at thousands of GPUs, e.g., due to expensive cross-node communication or devices spending significant time waiting on other devices to make progress. In this paper, we show how different types of parallelism methods (tensor, pipeline, and data parallelism) can be composed to scale to thousands of GPUs and models with trillions of parameters. We survey techniques for pipeline parallelism and propose a novel interleaved pipeline parallelism schedule that can improve throughput by 10+% with memory footprint comparable to existing approaches. We quantitatively study the trade-offs between tensor, pipeline, and data parallelism, and provide intuition as to how to configure distributed training of a large model. Our approach allows us to perform training iterations on a model with 1 trillion parameters at 502 petaFLOP/s on 3072 GPUs with achieved per-GPU throughput of 52% of theoretical peak. Our code is open sourced at https://github.com/nvidia/megatron-lm.",
            "authors": [
                "Vijay Anand Korthikanti",
                "Matei Zaharia",
                "Amar Phanishayee",
                "Bryan Catanzaro",
                "Julie Bernauer",
                "Prethvi Kashinkunti",
                "Dmitri Vainbrand",
                "Mostofa Patwary",
                "Patrick Legresley",
                "Jared Casper",
                "Mohammad Shoeybi",
                "Deepak Narayanan"
            ],
            "published": "2021-04-09",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "an-efficient-2d-method-for-training-super",
            "arxiv_id": "2104.05343",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.05343v1",
            "url_pdf": "https://arxiv.org/pdf/2104.05343v1.pdf",
            "title": "An Efficient 2D Method for Training Super-Large Deep Learning Models",
            "abstract": "Huge neural network models have shown unprecedented performance in real-world applications. However, due to memory constraints, model parallelism must be utilized to host large models that would otherwise not fit into the memory of a single device. Previous methods like Megatron partition the parameters of the entire model among multiple devices, while each device has to accommodate the redundant activations in forward and backward pass. In this work, we propose Optimus, a highly efficient and scalable 2D-partition paradigm of model parallelism that would facilitate the training of infinitely large language models. In Optimus, activations are partitioned and distributed among devices, further reducing redundancy. In terms of isoefficiency, Optimus significantly outperforms Megatron. On 64 GPUs of TACC Frontera, Optimus achieves 1.48X speedup for training, 1.78X speedup for inference, and 8X increase in maximum batch size over Megatron. Optimus surpasses Megatron in scaling efficiency by a great margin. The code is available at https://github.com/xuqifan897/Optimus.",
            "authors": [
                "Yang You",
                "Chaoyu Gong",
                "Shenggui Li",
                "Qifan Xu"
            ],
            "published": "2021-04-12",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "multilingual-language-models-predict-human",
            "arxiv_id": "2104.05433",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.05433v1",
            "url_pdf": "https://arxiv.org/pdf/2104.05433v1.pdf",
            "title": "Multilingual Language Models Predict Human Reading Behavior",
            "abstract": "We analyze if large language models are able to predict patterns of human reading behavior. We compare the performance of language-specific and multilingual pretrained transformer models to predict reading time measures reflecting natural human sentence processing on Dutch, English, German, and Russian texts. This results in accurate models of human reading behavior, which indicates that transformer models implicitly encode relative importance in language in a way that is comparable to human processing mechanisms. We find that BERT and XLM models successfully predict a range of eye tracking features. In a series of experiments, we analyze the cross-domain and cross-language abilities of these models and show how they reflect human sentence processing.",
            "authors": [
                "Lisa Beinborn",
                "Lena Jäger",
                "Ce Zhang",
                "Federico Pirovano",
                "Nora Hollenstein"
            ],
            "published": "2021-04-12",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.naacl-main.10",
            "conference_url_pdf": "https://aclanthology.org/2021.naacl-main.10.pdf",
            "proceeding": "naacl-2021-4"
        },
        {
            "id": "how-to-train-bert-with-an-academic-budget",
            "arxiv_id": "2104.07705",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.07705v2",
            "url_pdf": "https://arxiv.org/pdf/2104.07705v2.pdf",
            "title": "How to Train BERT with an Academic Budget",
            "abstract": "While large language models a la BERT are used ubiquitously in NLP, pretraining them is considered a luxury that only a few well-funded industry labs can afford. How can one train such models with a more modest budget? We present a recipe for pretraining a masked language model in 24 hours using a single low-end deep learning server. We demonstrate that through a combination of software optimizations, design choices, and hyperparameter tuning, it is possible to produce models that are competitive with BERT-base on GLUE tasks at a fraction of the original pretraining cost.",
            "authors": [
                "Omer Levy",
                "Moshe Berchansky",
                "Peter Izsak"
            ],
            "published": "2021-04-15",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.emnlp-main.831",
            "conference_url_pdf": "https://aclanthology.org/2021.emnlp-main.831.pdf",
            "proceeding": "emnlp-2021-11"
        },
        {
            "id": "documenting-the-english-colossal-clean",
            "arxiv_id": "2104.08758",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.08758v2",
            "url_pdf": "https://arxiv.org/pdf/2104.08758v2.pdf",
            "title": "Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus",
            "abstract": "Large language models have led to remarkable progress on many NLP tasks, and researchers are turning to ever-larger text corpora to train them. Some of the largest corpora available are made by scraping significant portions of the internet, and are frequently introduced with only minimal documentation. In this work we provide some of the first documentation for the Colossal Clean Crawled Corpus (C4; Raffel et al., 2020), a dataset created by applying a set of filters to a single snapshot of Common Crawl. We begin by investigating where the data came from, and find a significant amount of text from unexpected sources like patents and US military websites. Then we explore the content of the text itself, and find machine-generated text (e.g., from machine translation systems) and evaluation examples from other benchmark NLP datasets. To understand the impact of the filters applied to create this dataset, we evaluate the text that was removed, and show that blocklist filtering disproportionately removes text from and about minority individuals. Finally, we conclude with some recommendations for how to created and document web-scale datasets from a scrape of the internet.",
            "authors": [
                "Matt Gardner",
                "Margaret Mitchell",
                "Ana Marasović",
                "Dirk Groeneveld",
                "Gabriel Ilharco",
                "William Agnew",
                "Maarten Sap",
                "Jesse Dodge"
            ],
            "published": "2021-04-18",
            "conference": null,
            "conference_url_abs": "https://aclanthology.org/2021.emnlp-main.98",
            "conference_url_pdf": "https://aclanthology.org/2021.emnlp-main.98.pdf",
            "proceeding": "emnlp-2021-11"
        },
        {
            "id": "surface-form-competition-why-the-highest",
            "arxiv_id": "2104.08315",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.08315v9",
            "url_pdf": "https://arxiv.org/pdf/2104.08315v9.pdf",
            "title": "Surface Form Competition: Why the Highest Probability Answer Isn't Always Right",
            "abstract": "Large language models have shown promising results in zero-shot settings (Brown et al.,2020; Radford et al., 2019). For example, they can perform multiple choice tasks simply by conditioning on a question and selecting the answer with the highest probability. However, ranking by string probability can be problematic due to surface form competition-wherein different surface forms compete for probability mass, even if they represent the same underlying concept, e.g. \"computer\" and \"PC.\" Since probability mass is finite, this lowers the probability of the correct answer, due to competition from other strings that are valid answers (but not one of the multiple choice options). We introduce Domain Conditional Pointwise Mutual Information, an alternative scoring function that directly compensates for surface form competition by simply reweighing each option according to a term that is proportional to its a priori likelihood within the context of the specific zero-shot task. It achieves consistent gains in zero-shot performance over both calibrated (Zhao et al., 2021) and uncalibrated scoring functions on all GPT-2 and GPT-3 models over a variety of multiple choice datasets.",
            "authors": [
                "Vered Shwartz",
                "Luke Zettlemoyer",
                "Yejin Choi",
                "Peter West",
                "Ari Holtzman"
            ],
            "published": "2021-04-16",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "transfer-training-from-smaller-language-model",
            "arxiv_id": "2104.11390",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.11390v1",
            "url_pdf": "https://arxiv.org/pdf/2104.11390v1.pdf",
            "title": "Transfer training from smaller language model",
            "abstract": "Large language models have led to state-of-the-art accuracies across a range of tasks. However,training large language model needs massive computing resource, as more and more open source pre-training models are available, it is worthy to study how to take full advantage of available model. We find a method to save training time and resource cost by changing the small well-trained model to large model. We initialize a larger target model from a smaller source model by copy weight values from source model and padding with zeros or small initialization values on it to make the source and target model have approximate outputs, which is valid due to block matrix multiplication and residual connection in transformer structure. We test the target model on several data sets and find it is still comparable with the source model. When we continue training the target model, the training loss can start from a smaller value.",
            "authors": [
                "Han Zhang"
            ],
            "published": "2021-04-23",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "predicting-the-q-factor-and-modal-volume-of",
            "arxiv_id": "2104.12145",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2104.12145v2",
            "url_pdf": "https://arxiv.org/pdf/2104.12145v2.pdf",
            "title": "LLM helps design and optimize photonic crystal surface emitting lasers",
            "abstract": "Conventional design and optimization of Photonic Crystal Surface Emitting Lasers (PCSEL) usually requires expert knowledge in semiconductor physics and optimization algorithms, which is also known as the inverse design problem. However, with the trend towards automation and depersonalization of the entire integrated circuits (IC) industry, the conventional method, with the drawback of being relatively labor-intensive and sub-optimal, warrants further refinement. This technical dilemma remained until the emergence of Large Language Models (LLMs), such as OpenAI's ChatGPT and Google's Bard. This paper explores the possibility of applying LLMs to machine learning-based design and optimization of PCSELs. Specifically, we utilize GPT3.5 and GPT4. By simply having conversations, GPT assisted us with writing Finite Difference Time Domain (FDTD) simulation code and deep reinforcement learning code to acquire the optimized PCSEL solution, spanning from the proposition of ideas to the realization of algorithms. Given that GPT will perform better when given detailed and specific questions, we break down the PCSEL design problem into a series of sub-problems and converse with GPT by posing open-ended heuristic questions rather than definitive commands. This paper shows that LLMs, such as ChatGPT, can guide the nanophotonic design and optimization processes, on both the conceptual and technical level, and we propose new human-AI co-design strategies and show their practical implications. We achieve a significant milestone for the first step towards an automated end to end nanophotonic design and production pipeline.",
            "authors": [],
            "published": "2021-04-25",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        },
        {
            "id": "evaluating-groundedness-in-dialogue-systems",
            "arxiv_id": "2105.00071",
            "nips_id": null,
            "url_abs": "https://arxiv.org/abs/2105.00071v3",
            "url_pdf": "https://arxiv.org/pdf/2105.00071v3.pdf",
            "title": "Evaluating Attribution in Dialogue Systems: The BEGIN Benchmark",
            "abstract": "Knowledge-grounded dialogue systems powered by large language models often generate responses that, while fluent, are not attributable to a relevant source of information. Progress towards models that do not exhibit this issue requires evaluation metrics that can quantify its prevalence. To this end, we introduce the Benchmark for Evaluation of Grounded INteraction (BEGIN), comprised of 12k dialogue turns generated by neural dialogue systems trained on three knowledge-grounded dialogue corpora. We collect human annotations assessing the extent to which the models' responses can be attributed to the given background information. We then use BEGIN to analyze eight evaluation metrics. We find that these metrics rely on spurious correlations, do not reliably distinguish attributable abstractive responses from unattributable ones, and perform substantially worse when the knowledge source is longer. Our findings underscore the need for more sophisticated and robust evaluation metrics for knowledge-grounded dialogue. We make BEGIN publicly available at https://github.com/google/BEGIN-dataset.",
            "authors": [
                "David Reitter",
                "Tal Linzen",
                "Hannah Rashkin",
                "Nouha Dziri"
            ],
            "published": "2021-04-30",
            "conference": null,
            "conference_url_abs": null,
            "conference_url_pdf": null,
            "proceeding": null
        }
    ]
}
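
The listing above is a single page of a much larger result set. As a rough illustration only, not part of the API reference, the following Python sketch shows how a client could walk through such paginated responses and read the per-paper fields shown in each result object (title, arxiv_id, published, proceeding, and so on). The helper name iter_papers and the use of the third-party requests package are assumptions for this sketch, not anything prescribed by the API.

import itertools

import requests

BASE_URL = "https://paperswithcode.com/api/v1/papers/"  # papers endpoint used above


def iter_papers(query):
    """Yield every result for `query`, following pagination until `next` is null."""
    url = BASE_URL
    params = {"q": query}
    while url:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        page = response.json()
        yield from page["results"]
        url = page["next"]   # absolute URL of the next page, or None on the last page
        params = None        # the "next" URL already carries the q and page parameters


if __name__ == "__main__":
    # Print only a handful of papers so the example terminates quickly.
    for paper in itertools.islice(iter_papers("Large Language Models"), 5):
        print(paper["published"], paper["title"], paper.get("arxiv_id"))

Because each response reports count, next, and previous, a client never needs to compute page numbers itself; following the next link until it is null is sufficient to enumerate all results.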