[
  {
    "abstract": "MuSe Net is a deep learning framework that mimics expert annotation of 1H NMR spectra. It classifies multiplet splitting phenotypes using spectral data alone, handles up to four coupling constants, and provides segmentation with confidence scores. Evaluated on 48 expert-annotated spectra, it accurately detects multiplets, overlapping peaks, and anomalies.",
    "author": "Fischetti, Giulia; Schmid, Nicolas; Bruderer, Simon; Heitmann, Björn; Henrici, Andreas; Scarso, Alessandro; Caldarelli, Guido; Wilhelm, Dirk",
    "doi": "10.1016/j.jmr.2025.107851",
    "id": "Fischetti-2025-MultipletDL",
    "journal": "Journal of Magnetic Resonance",
    "keywords": "deep learning, NMR, multiplet classification",
    "pages": "107851",
    "title": "A deep learning framework for multiplet splitting classification in 1H NMR",
    "type": "article",
    "url": "https://doi.org/10.1016/j.jmr.2025.107851",
    "volume": "373",
    "year": 2025
  },
  {
    "abstract": "Reviews machine learning applications in NMR, including signal detection, chemical shift assignment, structure determination, prediction, NUS reconstruction, and denoising; outlines design considerations, key datasets, and trends across solution and solid-state studies.",
    "author": "Klukowski, Piotr; Riek, Roland; Güntert, Peter",
    "doi": "10.1016/j.pnmrs.2025.101575",
    "id": "Klukowski-2025-MachineLearningNMR",
    "journal": "Progress in Nuclear Magnetic Resonance Spectroscopy",
    "pages": "101575",
    "title": "Machine learning in NMR spectroscopy",
    "type": "article",
    "url": "https://doi.org/10.1016/j.pnmrs.2025.101575",
    "volume": "148–149",
    "year": 2025
  },
  {
    "abstract": "Pistachio is a curated database of millions of chemical reactions automatically extracted from patents (USPTO, EPO, WIPO), maintained by NextMove Software and used in cheminformatics, synthesis planning, and machine learning applications.",
    "author": "{NextMove Software}",
    "id": "NextMove-2025-Pistachio",
    "title": "Pistachio (reaction data, querying and analytics)",
    "type": "misc",
    "url": "https://www.nextmovesoftware.com/pistachio.html",
    "year": 2025
  },
  {
    "abstract": "fragSMILES is a fragment-based molecular string representation that improves over SMILES and SELFIES by encoding chemically meaningful fragments and chirality through a graph-reduction approach. It enables interpretable and efficient de novo molecular design with enhanced synthetic accessibility and scaffold diversity.",
    "author": "Mastrolorito, Fabrizio; Ciriaco, Fulvio; Togo, Maria Vittoria; Gambacorta, Nicola; Trisciuzzi, Daniela; Altomare, Cosimo Damiano; Amoroso, Nicola; Grisoni, Francesca; Nicolotti, Orazio",
    "doi": "10.1038/s42004-025-01423-3",
    "id": "Mastrolorito-2025-fragSMILES",
    "journal": "Communications Chemistry",
    "pages": "Article 26",
    "title": "fragSMILES as a chemical string notation for advanced fragment and chirality representation",
    "type": "article",
    "url": "https://doi.org/10.1038/s42004-025-01423-3",
    "volume": "8",
    "year": 2025
  },
  {
    "abstract": "The integration of large language models (LLMs) into drug design is gaining momentum; however, existing approaches often struggle to effectively incorporate three-dimensional molecular structures. Here, we present Token-Mol, a token-only 3D drug design model that encodes both 2D and 3D structural information, along with molecular properties, into discrete tokens. Built on a transformer decoder and trained with causal masking, Token-Mol introduces a Gaussian cross-entropy loss function tailored for regression tasks, enabling superior performance across multiple downstream applications. The model surpasses existing methods, improving molecular conformation generation by over 10% and 20% across two datasets, while outperforming token-only models by 30% in property prediction. In pocket-based molecular generation, it enhances drug-likeness and synthetic accessibility by approximately 11% and 14%, respectively. Notably, Token-Mol operates 35 times faster than expert diffusion models. In real-world validation, it improves success rates and, when combined with reinforcement learning, further optimizes affinity and drug-likeness, advancing AI-driven drug discovery.",
    "author": "Wang, Jike; Qin, Rui; Wang, Mingyang; Fang, Meijing; Zhang, Yangyang; Zhu, Yuchen; Su, Qun; Gou, Qiaolin; Shen, Chao; Zhang, Odin; Wu, Zhenxing; Jiang, Dejun; Zhang, Xujun; Zhao, Huifeng; Ge, Jingxuan; Wu, Zhourui; Kang, Yu; Hsieh, Chang-Yu; Hou, Tingjun",
    "doi": "10.1038/s41467-025-59628-y",
    "id": "Hou-2025-TokenMol",
    "journal": "Nature Communications",
    "pages": "Article 4416",
    "title": "Token-Mol 1.0: tokenized drug design with large language models",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-025-59628-y",
    "volume": "16",
    "year": 2025
  },
  {
    "abstract": "This review outlines the fundamentals of deep learning and surveys its applications in NMR spectroscopy. It addresses key challenges like low sensitivity and slow acquisition in multidimensional spectra, discusses the limitations of traditional algorithms, and explores how DL enhances speed and accuracy. The paper also highlights current obstacles and future opportunities for DL integration in NMR.",
    "author": "Luo, Yao; Zheng, Xiaoxu; Qiu, Mengjie; Gou, Yaoping; Yang, Zhengxian; Qu, Xiaobo; Chen, Zhong; Lin, Yanqin",
    "doi": "10.1016/j.pnmrs.2024.101556",
    "id": "Luo-2025-DeepNMRApps",
    "journal": "Progress in Nuclear Magnetic Resonance Spectroscopy",
    "pages": "101556",
    "title": "Deep learning and its applications in nuclear magnetic resonance spectroscopy",
    "type": "article",
    "url": "https://doi.org/10.1016/j.pnmrs.2024.101556",
    "volume": "146–147",
    "year": 2025
  },
  {
    "abstract": "Phosphorus-31 NMR spectroscopy is essential for analyzing phosphorus-containing compounds, but its interpretation is time-intensive and expert-dependent. This study presents a data-driven model that automates ³¹P NMR spectral analysis, achieving 53.6% Top-1 and 77.7% Top-5 accuracy in predicting local phosphorus environments. The approach outperforms expert chemists by 25% and remains robust across solvents. All models and datasets are openly available, supporting widespread adoption in structural elucidation workflows.",
    "author": "Alberts, Marvin; Hartrampf, Nina; Laino, Teodoro",
    "doi": "10.1021/acs.analchem.5c01460",
    "id": "Alberts-2025-NMR",
    "journal": "Analytical Chemistry",
    "title": "From spectra to structure: AI-powered ³¹P NMR interpretation",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.5c01460",
    "year": 2025
  },
  {
    "abstract": "Nuclear Magnetic Resonance (NMR) spectroscopy is a central characterization method for molecular structure elucidation, yet interpreting NMR spectra to deduce molecular structures remains challenging due to the complexity of spectral data and the vastness of the chemical space. In this work, we introduce DiffNMR, a novel end-to-end framework that leverages a conditional discrete diffusion model for de novo molecular structure elucidation from NMR spectra. DiffNMR refines molecular graphs iteratively through a diffusion-based generative process, ensuring global consistency and mitigating error accumulation inherent in autoregressive methods. The framework integrates a two-stage pretraining strategy that aligns spectral and molecular representations via diffusion autoencoder (Diff-AE) and contrastive learning, the incorporation of retrieval initialization and similarity filtering during inference, and a specialized NMR encoder with radial basis function (RBF) encoding for chemical shifts, preserving continuity and chemical correlation. Experimental results demonstrate that DiffNMR achieves competitive performance for NMR-based structure elucidation, offering an efficient and robust solution for automated molecular analysis.",
    "author": "Yang, Qingsong; Wu, Binglan; Liu, Xuwei; Chen, Bo; Li, Wei; Long, Gen; Chen, Xin; Xiao, Mingjun",
    "doi": "arXiv:2507.08854",
    "id": "Yang-2025-DiffNMR",
    "journal": "arXiv",
    "number": "2507.08854",
    "title": "DiffNMR: Diffusion models for NMR spectra elucidation",
    "type": "article",
    "url": "https://arxiv.org/abs/2507.08854",
    "year": 2025
  },
  {
    "author": "{PyTorch Core Team}",
    "id": "PyTorch-2025-AdamWDoc",
    "keywords": "optimizer, AdamW, fused kernel, PyTorch, CUDA",
    "note": "Accessed July 2025. Describes fused=True support in AdamW optimizer, enabling fused CUDA kernels for GPU acceleration.",
    "publisher": "PyTorch Foundation",
    "title": "PyTorch documentation: AdamW optimizer",
    "type": "manual",
    "url": "https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html",
    "year": 2025
  },
  {
    "abstract": "Generative tasks about molecules, including but not limited to molecule generation, are crucial for drug discovery and material design, and have consistently attracted significant attention. In recent years, diffusion models have emerged as an impressive class of deep generative models, sparking extensive research and leading to numerous studies on their application to molecular generative tasks. Despite the proliferation of related work, there remains a notable lack of up-to-date and systematic surveys in this area. Particularly, due to the diversity of diffusion model formulations, molecular data modalities, and generative task types, the research landscape is challenging to navigate, hindering understanding and limiting the area's growth. To address this, this paper conducts a comprehensive survey of diffusion model-based molecular generative methods. We systematically review the research from the perspectives of methodological formulations, data modalities, and task types, offering a novel taxonomy. This survey aims to facilitate understanding and further flourishing development in this area. The relevant papers are summarized at: https://github.com/AzureLeon1/awesome-molecular-diffusion-models.",
    "author": "Wang, Liang; Song, Chao; Liu, Zhiyuan; Rong, Yu; Liu, Qiang; Wu, Shu",
    "doi": "10.48550/arXiv.2502.09511",
    "id": "Wang-2025-DiffusionSurvey",
    "journal": "arXiv",
    "note": "Preprint; no peer-reviewed version known yet.",
    "title": "Diffusion models for molecules: A survey of methods and tasks",
    "type": "article",
    "url": "https://arxiv.org/abs/2502.09511",
    "volume": "2502.09511",
    "year": 2025
  },
  {
    "abstract": "PubChem (https://pubchem.ncbi.nlm.nih.gov) is a large and highly-integrated public chemical database resource at NIH. In the past two years, significant updates were made to PubChem. With additions from over 130 new sources, PubChem contains >1000 data sources, 119 million compounds, 322 million substances and 295 million bioactivities. New interfaces, such as the consolidated literature panel and the patent knowledge panel, were developed. The consolidated literature panel combines all references about a compound into a single list, allowing users to easily find, sort, and export all relevant articles for a chemical in one place. The patent knowledge panels for a given query chemical or gene display chemicals, genes, and diseases co-mentioned with the query in patent documents, helping users to explore relationships between co-occurring entities within patent documents. PubChemRDF was expanded to include the co-occurrence data underlying the literature knowledge panel, enabling users to exploit semantic web technologies to explore entity relationships based on the co-occurrences in the scientific literature. The usability and accessibility of information on chemicals with non-discrete structures (e.g. biologics, minerals, polymers, UVCBs and glycans) were greatly improved with dedicated web pages that provide a comprehensive view of all available information in PubChem for these chemicals.",
    "author": "Kim, Sunghwan; Chen, Jie; Cheng, Tiejun; Gindulyte, Asta; He, Jia; He, Siqian; Li, Qingliang; Shoemaker, Benjamin A.; Thiessen, Paul A.; Yu, Bo; Zaslavsky, Leonid; Zhang, Jian; Bolton, Evan E.",
    "doi": "10.1093/nar/gkae1059",
    "id": "Kim-2025-PubChemUpdate",
    "journal": "Nucleic Acids Research",
    "number": "D1",
    "pages": "D1516–D1525",
    "title": "PubChem 2025 update",
    "type": "article",
    "url": "https://doi.org/10.1093/nar/gkae1059",
    "volume": "53",
    "year": 2025
  },
  {
    "author": "Andreev, R.",
    "id": "numpde-2025-nmr-to-structure-lite",
    "keywords": "NMR, SMILES, structure prediction, GitHub, transformer, numpde",
    "title": "Lightweight repo to train/evaluate NMR-to-SMILES models by Alberts et al.",
    "type": "online",
    "url": "https://github.com/numpde/nmr-to-structure-lite",
    "year": 2025
  },
  {
    "abstract": "Chemical language models (CLMs) have shown strong performance in molecular property prediction and generation tasks. However, the impact of design choices, such as molecular representation format, tokenization strategy, and model architecture, on both performance and chemical interpretability remains underexplored. In this study, we systematically evaluate how these factors influence CLM performance and chemical understanding. We evaluated models through finetuning on downstream tasks and probing the structure of their latent spaces using simple classifiers and dimensionality reduction techniques. Despite similar performance on downstream tasks across model configurations, we observed substantial differences in the structure and interpretability of their internal representations. SMILES molecular representation format with atomwise tokenization strategy consistently produced more chemically meaningful embeddings, while models based on BART and RoBERTa architectures yielded comparably interpretable representations. These findings highlight that design choices meaningfully shape how chemical information is represented, even when external metrics appear unchanged.",
    "author": "Fender, Inken; Gut, Jannik Adrian; Lemmin, Thomas",
    "doi": "10.1101/2025.05.23.655735",
    "id": "Fender-2025-ChemLangModels",
    "journal": "bioRxiv",
    "keywords": "chemical language models, SMILES, molecular representation, deep learning, interpretability, tokenization, BART, RoBERTa",
    "title": "Beyond performance: How design choices shape chemical language models",
    "type": "article",
    "url": "https://www.biorxiv.org/content/10.1101/2025.05.23.655735v1",
    "year": 2025
  },
  {
    "abstract": "Most SMILES-based molecular language models use token-level masked language modeling (MLM), which saturates quickly and lacks substructure-level understanding. SMI-Editor introduces an edit-based pretraining strategy that randomly deletes substructures and trains a model to reconstruct them, enabling better learning of molecular fragments. It achieves state-of-the-art performance on property prediction tasks and demonstrates better scaling efficiency.",
    "author": "Zheng, Kangjie; Liang, Siyue; Yang, Junwei; Feng, Bin; Liu, Zequn; Ju, Wei; Xiao, Zhiping; Zhang, Ming",
    "id": "Zheng-2025-SMIEditor",
    "journal": "ICLR (poster, OpenReview)",
    "keywords": "SMILES, molecular language model, fragment-level supervision, edit-based pretraining, property prediction",
    "title": "SMI-Editor: Edit-based SMILES language model with fragment-level supervision",
    "type": "article",
    "url": "https://openreview.net/forum?id=M29nUGozPa",
    "year": 2025
  },
  {
    "abstract": "The paper presents a multimodal multitask transformer model for predicting molecular structures from spectroscopic data, including 1H-NMR, 13C-NMR, and IR spectra. Trained on simulated and finetuned on experimental data, the model achieves Top-1 prediction accuracy up to 96%, matching expert-level performance. Multitask learning and unpaired data integration enhance model generalizability and efficiency.",
    "author": "Alberts, Marvin; Hartrampf, Nina; Laino, Teodoro",
    "doi": "10.26434/chemrxiv-2025-q80r9",
    "id": "Alberts-2025-MultimodalNMR",
    "journal": "ChemRxiv",
    "keywords": "structure elucidation, NMR, IR spectroscopy, machine learning, transformer model, multitask learning, multimodal learning, cheminformatics",
    "note": "",
    "title": "Automated structure elucidation at human-level accuracy via a multimodal multitask language model",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv-2025-q80r9",
    "year": 2025
  },
  {
    "abstract": "Identifying molecular structure based on spectroscopic readings is a key task in a variety of chemical and biological applications. Common spectroscopy techniques, such as Infrared (IR) Spectroscopy and Mass Spectrometry (MS), provide detailed information on the structure of molecular compounds but nonetheless require expert-level knowledge to decode. Machine learning has emerged as a potential solution for automating structure prediction from chemical spectra, however current approaches generally focus on single sensor modalities, neglecting to leverage the complementary information contained within differing spectra. In this paper, we introduce Peak2Patch, a novel approach to fusion-enhanced prediction of functional groups from IR and mass spectra. First, we perform a detailed comparison of backbone networks for encoding both sparse mass spectra and dense IR spectra and demonstrate the superior performance of transformer neural networks over current state-of-the-art convolutional neural networks. Second, we evaluate three broad categories of fusion: early (raw feature), middle (deep feature), and late (decision) fusion, demonstrating the potential of a deep feature fusion-based approach. Lastly, we present Peak2Patch, our attention-based fusion scheme which leverages cross-attention to mix features between encoded tokens of the two modalities. We validate our approach on a publicly available multimodal dataset of 790k simulated molecules, demonstrating a large improvement on functional group prediction over both the previous state-of-the-art and our own strong single-modal baselines.",
    "author": "Jacobson, Philip; Kumar, Suhas; Krishnakumar, Raga; Timlin, Jerilyn A.",
    "doi": "10.26434/chemrxiv-2025-7329c",
    "id": "Jacobson-2025-Peak2Patch",
    "journal": "ChemRxiv",
    "note": "Preprint; not peer-reviewed",
    "title": "Peak2Patch: High-Fidelity Functional Group Identification Through Attention-based Fusion of Infrared and Mass Spectra",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv-2025-7329c",
    "year": 2025
  },
  {
    "abstract": "Drug-target Interaction (DTI) prediction provides strong support for drug repurposing. Most of the existing models are designed for specific protein / drug datasets to identify decoys, but there is still room for improvement in the generalization capability. Therefore, a universal DTI framework supporting Multiple Data Foundations with Alternative Training and Contrastive Learning (MuFAl) is proposed, learning diverse information such as protein structure and sequence from heterogeneous data sources. MuFAl makes predictions based on the representation distance in vector spaces, with improved generalization capability brought by alternative training and fine-grained recognition ability of decoys provided by contrastive learning. This model could be used for scenarios including large-scale heterogeneous data processing or zero-shot predictions. Experimental results indicate that compared with some state-of-the-art models, MuFAl retains the fine-grained feature extraction ability while improving the generalization ability.",
    "author": "Chen, Xinyuan; Husen, Mohd Nizam; Huang, Xuxia",
    "booktitle": "2025 19th International Conference on Ubiquitous Information Management and Communication (IMCOM)",
    "doi": "10.1109/IMCOM64595.2025.10857588",
    "id": "Chen-2025-MuFAl",
    "keywords": "Drugs;Training;Proteins;Soft sensors;Contrastive learning;Predictive models;Feature extraction;Vectors;Compounds;Diffusion tensor imaging;DTI;PLM;molecular fingerprint;alternative training;contrastive learning",
    "pages": "1-6",
    "title": "MuFAl: A Universal Drug-Target Interaction Prediction Framework",
    "type": "conference",
    "url": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10857588&isnumber=10857432",
    "year": 2025
  },
  {
    "abstract": "Background: Understanding the molecular properties of chemical compounds is essential for identifying potential candidates or ensuring safety in drug discovery. However, exploring the vast chemical space is time-consuming and costly, necessitating the development of time-efficient and cost-effective computational methods. Recent advances in deep learning approaches have offered deeper insights into molecular structures. Leveraging this progress, we developed a novel multi-view learning model.\n\nResults: We introduce a graph-integrated model that captures both local and global structural features of chemical compounds. In our model, graph attention layers are employed to effectively capture essential local structures by jointly considering atom and bond features, while multi-head attention layers extract important global features. We evaluated our model on nine MoleculeNet datasets, encompassing both classification and regression tasks, and compared its performance with state-of-the-art methods. Our model achieved an average area under the receiver operating characteristic (AUROC) of 0.822 and a root mean squared error (RMSE) of 1.133, representing a 3% improvement in AUROC and a 7% improvement in RMSE over state-of-the-art models in extensive seed testing.",
    "author": "Moon, Heesang; Rho, Mina",
    "doi": "10.1186/s13040-024-00419-4",
    "id": "Moon-2025-MultiChem",
    "journal": "BioData Mining",
    "keywords": "MultiChem, Graph Attention Network, Molecular Property Prediction, Deep Learning, Drug Discovery, MoleculeNet",
    "number": "4",
    "title": "MultiChem: predicting chemical properties using multi-view graph attention network",
    "type": "article",
    "url": "https://doi.org/10.1186/s13040-024-00419-4",
    "volume": "18",
    "year": 2025
  },
  {
    "abstract": "Mass spectrometry plays a fundamental role in elucidating the structures of unknown molecules and subsequent scientific discoveries. One formulation of the structure elucidation task is the conditional *de novo* generation of molecular structure given a mass spectrum. Toward a more accurate and efficient scientific discovery pipeline for small molecules, we present DiffMS, a formula-restricted encoder-decoder generative network that achieves state-of-the-art performance on this task. The encoder utilizes a transformer architecture and models mass spectra domain knowledge such as peak formulae and neutral losses, and the decoder is a discrete graph diffusion model restricted by the heavy-atom composition of a known chemical formula. To develop a robust decoder that bridges latent embeddings and molecular structures, we pretrain the diffusion decoder with fingerprint-structure pairs, which are available in virtually infinite quantities, compared to structure-spectrum pairs that number in the tens of thousands. Extensive experiments on established benchmarks show that DiffMS outperforms existing models on *de novo* molecule generation. We provide several ablations to demonstrate the effectiveness of our diffusion and pretraining approaches and show consistent performance scaling with increasing pretraining dataset size. DiffMS code is publicly available at this link: https://github.com/coleygroup/DiffMS.",
    "author": "Bohde, Montgomery; Manjrekar, Mrunali; Wang, Runzhong; Ji, Shuiwang; Coley, Connor W.",
    "doi": "10.48550/arXiv.2502.09571",
    "id": "Bohde-2025-DiffMS",
    "journal": "arXiv",
    "title": "DiffMS: Diffusion Generation of Molecules Conditioned on Mass Spectra",
    "type": "article",
    "url": "https://arxiv.org/abs/2502.09571",
    "year": 2025
  },
  {
    "abstract": "The rapid advent of machine learning (ML) and artificial intelligence (AI) has catalyzed major transformations in chemistry, yet the application of these methods to spectroscopic and spectrometric data, referred to as *Spectroscopy Machine Learning (SpectraML)*, remains relatively underexplored. Modern spectroscopic techniques (MS, NMR, IR, Raman, UV-Vis) generate an ever-growing volume of high-dimensional data, creating a pressing need for automated and intelligent analysis beyond traditional expert-based workflows.\n\nIn this survey, we provide a unified review of SpectraML, systematically examining state-of-the-art approaches for both forward tasks (molecule-to-spectrum prediction) and inverse tasks (spectrum-to-molecule inference). We trace the historical evolution of ML in spectroscopy, from early pattern recognition to the latest foundation models capable of advanced reasoning, and offer a taxonomy of representative neural architectures, including graph-based and transformer-based methods. Addressing key challenges such as data quality, multimodal integration, and computational scalability, we highlight emerging directions such as synthetic data generation, large-scale pretraining, and few- or zero-shot learning.\n\nTo foster reproducible research, we also release an open-source repository containing recent papers and their corresponding curated datasets.",
    "author": "Guo, Kehan; Shen, Yili; Gonzalez-Montiel, Gisela Abigail; Huang, Yue; Zhou, Yujun; Surve, Mihir; Guo, Zhichun; Das, Prayel; Chawla, Nitesh V.; Wiest, Olaf; Zhang, Xiangliang",
    "doi": "10.48550/arXiv.2502.09897",
    "id": "Guo-2025-SpectraML",
    "journal": "arXiv",
    "number": "2502.09897",
    "title": "Artificial intelligence in spectroscopy: Advancing chemistry from prediction to generation and beyond",
    "type": "article",
    "url": "https://arxiv.org/abs/2502.09897",
    "volume": "",
    "year": 2025
  },
  {
    "author": "Zhou, Zhouao; Liao, Xinli; Qiu, Xu; Zhang, Yue; Dong, Jiyang; Qu, Xiaobo; Lin, Donghai",
    "doi": "10.1021/acs.analchem.4c05632",
    "id": "Zhou-2025-NMRformer",
    "journal": "Analytical Chemistry",
    "note": "NMRformer code is available at: https://github.com/zza1211/NMRformer/blob/main/net.py",
    "number": "1",
    "publisher": "American Chemical Society (ACS)",
    "title": "NMRformer: A Transformer-Based Deep Learning Framework for Peak Assignment in 1D ¹H NMR Spectroscopy",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.4c05632",
    "volume": "97",
    "year": 2025
  },
  {
    "author": "Domżał, Barbara; Grochowska-Tatarczak, Magdalena; Malinowski, Przemysław; Miasojedow, Błażej; Kazimierczuk, Krzysztof; Gambin, Anna",
    "doi": "10.26434/chemrxiv-2025-pjzl3",
    "id": "Domzal-2025-WassersteinNMR",
    "journal": "ChemRxiv",
    "title": "NMR reaction monitoring robust to spectral distortions",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv-2025-pjzl3",
    "year": 2025
  },
  {
    "abstract": "Proposes a source–target transformer model trained on 200B molecular pairs for exploring chemical neighborhoods, regularized with a similarity kernel to better correlate generation probability with molecular similarity.",
    "author": "Tibo, Alessandro; He, Jiazhen; Janet, Jon Paul; Nittinger, Eva; Engkvist, Ola",
    "doi": "10.1038/s41467-024-51672-4",
    "id": "Tibo-2024-ChemSpaceTransformer",
    "journal": "Nature Communications",
    "pages": "Article 7315",
    "title": "Exhaustive local chemical space exploration using a transformer model",
    "type": "article",
    "url": "https://www.nature.com/articles/s41467-024-51672-4",
    "volume": "15",
    "year": 2024
  },
  {
    "abstract": "This study evaluates simulation and matching strategies for HSQC spectra in molecular identification, comparing ACD/Labs, MestReNova, DFT, and a GNN-based model. Combining 1D spectra reconstruction and advanced peak-matching techniques, it enables accurate identification among analogues and supports database searching and structural revision. A Colab notebook is available for public use.",
    "author": "Priessner, Martin; Lewis, Richard J.; Johansson, Magnus J.; Goodman, Jonathan M.; Janet, Jon Paul; Tomberg, Anna",
    "doi": "10.1021/acs.jcim.3c01735",
    "id": "Priessner-2024-HSQCMatch",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "8",
    "pages": "3180–3191",
    "title": "HSQC spectra simulation and matching for molecular identification",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.3c01735",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "This study evaluates SMILES and SELFIES tokenization techniques using Byte Pair Encoding (BPE) and Atom Pair Encoding (APE) in BERT-based models. APE, especially when paired with SMILES, significantly improves classification performance on biophysical and physiological datasets, highlighting the importance of tokenization in chemical language modeling for drug discovery.",
    "author": "Leon, Miguelangel; Perezhohin, Yuriy; Peres, Fernando; Popovič, Aleš; Castelli, Mauro",
    "doi": "10.1038/s41598-024-76440-8",
    "id": "Leon-2024-Tokenization",
    "journal": "Scientific Reports",
    "pages": "Article 25016",
    "title": "Comparing SMILES and SELFIES tokenization for enhanced chemical language modeling",
    "type": "article",
    "url": "https://www.nature.com/articles/s41598-024-76440-8",
    "volume": "14",
    "year": 2024
  },
  {
    "abstract": "MultiModalTransformer (MMT) is a deep learning model for molecular structure prediction from multiple spectroscopic modalities (1H-NMR, 13C-NMR, HSQC, COSY, IR, MS). Trained on simulated data, it achieves up to 94% accuracy on experimental samples. MMT employs attention mechanisms, supports noisy inputs, and enables structure improvement from partial guesses. A GUI facilitates lab integration, and explainability is supported via token-level analysis. The method underscores the importance of chemical space diversity in training over perfect spectral fidelity.",
    "author": "Priessner, Martin; Lewis, Richard; Janet, Jon Paul; Lemurell, Isak; Johansson, Magnus; Goodman, Jonathan; Tomberg, Anna",
    "doi": "10.26434/chemrxiv-2024-zmmnw",
    "id": "Priessner-2024-MMT",
    "journal": "ChemRxiv",
    "title": "Enhancing molecular structure elucidation: MultiModalTransformer for both simulated and experimental spectra",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv-2024-zmmnw",
    "year": 2024
  },
  {
    "abstract": "Structure Seer is a machine learning model for chemical structure elucidation using atom-level isotropic shielding constants instead of traditional NMR shifts. By predicting atom connectivity from elemental composition and quantum-calculated properties, it circumvents limitations of small experimental datasets. Trained on QM9 and PubChem-derived structures, the model predicts chemical bonds with high accuracy and supports tasks like NMR peak attribution and isomer ranking.",
    "author": "Sapegin, Denis A.; Bear, Joseph C.",
    "doi": "10.1039/d3dd00178d",
    "id": "Sapegin-2024-StructureSeer",
    "journal": "Digital Discovery",
    "number": "1",
    "pages": "186–200",
    "title": "Structure Seer – a machine learning model for chemical structure elucidation from node labelling of a molecular graph",
    "type": "article",
    "url": "https://doi.org/10.1039/d3dd00178d",
    "volume": "3",
    "year": 2024
  },
  {
    "abstract": "Graph neural networks (GNNs) are powerful tools for modeling graph-structured data, used in high-stakes domains such as finance, transportation, and drug discovery. However, GNNs can also leak private information, be misled by adversarial attacks, amplify societal biases, and lack interpretability—posing risks to users and society. This survey comprehensively reviews efforts to build trustworthy GNNs across four dimensions: privacy, robustness, fairness, and explainability. For each, it proposes taxonomies, highlights representative methods, and outlines future research directions to enhance GNN reliability and trust.",
    "author": "Dai, Enyan; Zhao, Tianxiang; Zhu, Huaisheng; Xu, Junjie; Guo, Zhimeng; Liu, Hui; Tang, Jiliang; Wang, Suhang",
    "doi": "10.1007/s11633-024-1510-8",
    "id": "Dai-2024-TrustGNN",
    "journal": "Machine Intelligence Research",
    "keywords": "graph neural networks; GNN; trustworthiness; privacy; robustness; fairness; explainability; survey",
    "pages": "1011–1061",
    "title": "A Comprehensive Survey on Trustworthy Graph Neural Networks: Privacy, Robustness, Fairness, and Explainability",
    "type": "article",
    "url": "https://doi.org/10.1007/s11633-024-1510-8",
    "volume": "21",
    "year": 2024
  },
  {
    "abstract": "AdamW modifies Adam by decoupling weight decay from the adaptive update, avoiding interference with gradient moments. This paper proves convergence of AdamW, showing it minimizes a dynamically regularized loss. The authors establish improved gradient complexity bounds for AdamW, Adam, and ℓ2-Adam on nonconvex and PŁ-conditioned problems, and demonstrate that AdamW achieves lower generalization error from a Bayesian perspective. Experiments validate the theoretical findings.",
    "author": "Zhou, Pan; Xie, Xingyu; Lin, Zhouchen; Yan, Shuicheng",
    "doi": "10.1109/TPAMI.2024.3382294",
    "id": "Zhou-2024-AdamWConvergence",
    "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
    "number": "9",
    "pages": "6486–6493",
    "title": "Towards understanding convergence and generalization of AdamW",
    "type": "article",
    "url": "https://doi.org/10.1109/TPAMI.2024.3382294",
    "volume": "46",
    "year": 2024
  },
  {
    "abstract": "Text-based foundation models have become an important part of scientific discovery, with molecular foundation models accelerating advancements in molecular design and materials science. However, existing models are constrained by closed-vocabulary tokenizers which capture only a fraction of molecular space. In this work, we systematically evaluate thirty tokenizers, including 19 chemistry-specific ones, for their coverage of the SMILES molecular representation language, revealing significant gaps. To assess the impact of tokenizer choice, we introduce n-gram language models as a low-cost proxy and validate their effectiveness by training and fine-tuning 18 RoBERTa-style encoders for molecular property prediction. To overcome the limitations of existing tokenizers, we propose two new tokenizers -- Smirk and Smirk-GPE -- with full coverage of the OpenSMILES specification. Our results highlight the need for open-vocabulary modeling and chemically diverse benchmarks in cheminformatics. The proposed tokenizer framework systematically integrates nuclear, electronic, and geometric degrees of freedom; this facilitates applications in pharmacology, agriculture, biology, and energy storage.",
    "author": "Wadell, Alexius; Bhutani, Anoushka; Viswanathan, Venkatasubramanian",
    "doi": "10.48550/arXiv.2409.15370",
    "id": "Wadell-2024-Smirk",
    "journal": "arXiv",
    "keywords": "tokenizer, molecular foundation models, SMILES, cheminformatics, language models, OpenSMILES",
    "number": "2409.15370",
    "title": "Smirk: An atomically complete tokenizer for molecular foundation models",
    "type": "article",
    "url": "https://arxiv.org/abs/2409.15370",
    "year": 2024
  },
  {
    "abstract": "The tokenizer is a fundamental yet often-overlooked component of large language models (LLMs), acting as a preprocessing stage that maps strings to token IDs and vice versa. In this comprehensive lecture, Andrej Karpathy builds a GPT-style tokenizer from scratch, explaining its role, training (using byte pair encoding), and implementation. The lecture explores tokenization’s impact on LLM behavior, peculiarities, and limitations, and it introduces tools like `tiktoken` and `sentencepiece`, encouraging deeper understanding of this critical system layer.",
    "author": "Karpathy, Andrej",
    "id": "Karpathy-2024-Tokenizer",
    "note": "Published February 20, 2024",
    "publisher": "YouTube",
    "title": "Let's build the GPT tokenizer (YouTube video)",
    "type": "online",
    "url": "https://youtu.be/zduSFxRajkE",
    "year": 2024
  },
  {
    "abstract": "The reliability of chemical language models (ChemLMs) in capturing molecular meaning beyond surface-level SMILES patterns remains an open question. A zero-shot diagnostic framework, AMORE, is introduced to assess the robustness of ChemLMs using chemically invariant perturbations of SMILES strings—such as kekulization and cycle renumbering—that preserve molecular identity while altering syntactic form.\n\nModel consistency is evaluated by comparing embedding similarity between original and perturbed representations. It is observed that common ChemLMs, including ChemBERTa and ChemT5, produce notably divergent embeddings in response to minor, chemically preserving string changes. This sensitivity is evident across several benchmarks, including QM9 and MoleculeNet, and is especially pronounced in models trained exclusively on textual molecular representations. In contrast, graph-based models such as GIN demonstrate robustness under the same conditions.\n\nOn cross-modal tasks like molecule captioning, the evaluation metric employed by AMORE is found to correlate strongly with traditional text generation metrics such as ROUGE and METEOR, suggesting broader applicability. The findings indicate that many ChemLMs fail to encode chemically invariant representations and instead overfit to the syntactic properties of SMILES. The AMORE framework provides a scalable, label-free means to systematically expose and quantify such vulnerabilities.",
    "author": "Ganeeva, Veronika; Sakhovskiy, Andrey; Khrabrov, Kuzma; Savchenko, Andrey; Kadurin, Artur; Tutubalina, Elena",
    "doi": "10.18653/v1/2024.findings-emnlp.760",
    "id": "Ganeeva-2024-AMORE",
    "journal": "Findings of the Association for Computational Linguistics: EMNLP 2024",
    "keywords": "chemical language models, SMILES, AMORE, molecule representation, augmentation, zero-shot evaluation",
    "title": "Lost in translation: Chemical language models and the misunderstanding of molecule structures",
    "type": "article",
    "url": "https://aclanthology.org/2024.findings-emnlp.760.pdf",
    "year": 2024
  },
  {
    "abstract": "Artificial intelligence, and especially deep neural networks, have evolved substantially in the recent years, infiltrating numerous domains of applications. This paper provides a thorough overview of the methods developed to explain deep neural networks, introduces a classification of existing approaches, and discusses the realization of these methods on hardware to enhance understanding and trust.",
    "author": "Antamis, Thanasis; Drosou, Anastasis; Vafeiadis, Thanasis; Nizamis, Alexandros; Ioannidis, Dimosthenis; Tzovaras, Dimitrios",
    "doi": "10.1016/j.neucom.2024.128204",
    "id": "Antamis-2024-XDNNReview",
    "journal": "Neurocomputing",
    "keywords": "XAI; Deep neural networks; xDNN; Survey",
    "pages": "128204",
    "publisher": "Elsevier B.V.",
    "title": "Interpretability of deep neural networks: A review of methods, classification and hardware",
    "type": "article",
    "url": "https://doi.org/10.1016/j.neucom.2024.128204",
    "volume": "601",
    "year": 2024
  },
  {
    "abstract": "We review distributionally robust optimization (DRO), a principled approach for constructing statistical estimators that hedge against the impact of deviations in the expected loss between the training and deployment environments. Many well-known estimators in statistics and machine learning (e.g. AdaBoost, LASSO, ridge regression, dropout training, etc.) are distributionally robust in a precise sense. We hope that by discussing the DRO interpretation of well-known estimators, statisticians who may not be too familiar with DRO may find a way to access the DRO literature through the bridge between classical results and their DRO equivalent formulation. On the other hand, the topic of robustness in statistics has a rich tradition associated with removing the impact of contamination. Thus, another objective of this paper is to clarify the difference between DRO and classical statistical robustness. As we will see, these are two fundamentally different philosophies leading to completely different types of estimators. In DRO, the statistician hedges against an environment shift that occurs after the decision is made; thus DRO estimators tend to be pessimistic in an adversarial setting, leading to a min-max type formulation. In classical robust statistics, the statistician seeks to correct contamination that occurred before a decision is made; thus robust statistical estimators tend to be optimistic leading to a min-min type formulation.",
    "author": "Blanchet, Jose; Li, Jiajin; Lin, Sirui; Zhang, Xuhui",
    "doi": "10.48550/arXiv.2401.14655",
    "id": "Blanchet-2024-DROStats",
    "journal": "arXiv preprint",
    "note": "Preprint; accepted to Statistical Science",
    "title": "Distributionally robust optimization and robust statistics",
    "type": "article",
    "url": "https://arxiv.org/abs/2401.14655",
    "year": 2024
  },
  {
    "abstract": "A multimodal framework predicts molecular properties using SMILES and 3D conformers. Contrastive learning aligns representations via InfoNCE loss, supported by ConR for regression, SupCon for classification, and feature distribution smoothing (FDS). Outperforms state-of-the-art on diverse tasks like SARS-CoV-2 docking, MoleculeNet, and kinase inhibition.",
    "author": "Nguyen, Long D.; Nguyen, Quang H.; Trinh, Quang H.; Nguyen, Binh P.",
    "doi": "10.1021/acs.jcim.4c01240",
    "id": "Nguyen-2024-MultimodalSMILES",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "SMILES, molecular property prediction, 3D conformers, contrastive learning, ConR, SupCon, FDS",
    "number": "24",
    "pages": "9173–9195",
    "title": "From SMILES to enhanced molecular property prediction: A unified multimodal framework with predicted 3D conformers and contrastive learning techniques",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.4c01240",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "This study proposes a self-supervised learning framework that integrates SMILES and graph modalities for molecular property prediction. A unified Transformer backbone processes both tokenized modalities using a masked reconstruction strategy, enhanced by a novel non-overlapping masking technique. The approach shows state-of-the-art performance across molecular tasks and confirms the complementarity of multi-modal features through ablation studies.",
    "author": "Shen, Ao; Yuan, Mingzhi; Ma, Yingfan; Du, Jie; Wang, Manning",
    "doi": "10.1093/bib/bbae256",
    "id": "Shen-2024-CMMSL",
    "journal": "Briefings in Bioinformatics",
    "number": "4",
    "pages": "bbae256",
    "title": "Complementary multi-modality molecular self-supervised learning via non-overlapping masking for property prediction",
    "type": "article",
    "url": "https://doi.org/10.1093/bib/bbae256",
    "volume": "25",
    "year": 2024
  },
  {
    "abstract": "Moleco is a contrastive learning framework designed to enhance chemical language models (CLMs) by incorporating structural information derived from molecular fingerprints. By training CLMs to distinguish structurally similar and dissimilar molecules based on fingerprint similarities, Moleco improves the models' understanding of molecular structures, leading to enhanced performance in molecular property prediction tasks.",
    "author": "Park, Jun-Hyung; Park, Hyuntae; Kim, Yeachan; Lim, Woosang; Lee, SangKeun",
    "booktitle": "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    "doi": "10.18653/v1/2024.emnlp-industry.30",
    "id": "Park-2024-Moleco",
    "keywords": "Moleco, contrastive learning, chemical language models, molecular fingerprints, molecular property prediction",
    "pages": "408–420",
    "publisher": "Association for Computational Linguistics",
    "title": "Moleco: Molecular contrastive learning with chemical language models for molecular property prediction",
    "type": "inproceedings",
    "url": "https://aclanthology.org/2024.emnlp-industry.30",
    "year": 2024
  },
  {
    "abstract": "CONSMI is a contrastive learning framework for SMILES-based molecular generation. It uses multiple SMILES representations of the same molecule as positive pairs to learn comprehensive molecular encodings. Experimental results show improved novelty and validity in generated molecules, as well as favorable performance in classification tasks like compound–protein interaction.",
    "author": "Qian, Ying; Shi, Minghua; Zhang, Qian",
    "doi": "10.3390/molecules29020495",
    "id": "Qian-2024-CONSMI",
    "journal": "Molecules",
    "keywords": "CONSMI, SMILES, contrastive learning, molecular generation, compound–protein interaction",
    "number": "2",
    "pages": "495",
    "title": "CONSMI: Contrastive learning of SMILES representations for improved molecular generation",
    "type": "article",
    "url": "https://doi.org/10.3390/molecules29020495",
    "volume": "29",
    "year": 2024
  },
  {
    "abstract": "SALSA is a SMILES-based transformer autoencoder for molecular representation learning, enhanced with supervised contrastive learning. It aligns structurally similar molecules in latent space by incorporating graph-level semantic similarities. Evaluations show improved structural, physicochemical, and biological awareness compared to standard autoencoders.",
    "author": "Kirchoff, Kathryn E.; Maxfield, Travis; Tropsha, Alexander; Gomez, Shawn M.",
    "doi": "10.1609/aaai.v38i12.29221",
    "id": "Kirchoff-2024-SALSA",
    "journal": "Proceedings of the AAAI Conference on Artificial Intelligence",
    "keywords": "molecular representation, SMILES, contrastive learning, transformer autoencoder, graph similarity",
    "number": "12",
    "pages": "13211–13219",
    "title": "SALSA: Semantically-aware latent space autoencoder",
    "type": "article",
    "url": "https://ojs.aaai.org/index.php/AAAI/article/view/29221",
    "volume": "38",
    "year": 2024
  },
  {
    "abstract": "This paper proposes a contrastive learning method for SMILES representations to improve molecular property prediction. By treating SMILES permutations of the same molecule as positive pairs and different molecules as negative pairs, the model learns invariant molecular features. Experiments on multiple datasets show improved predictive performance over traditional methods.",
    "author": "Quiles, Marcelo G.; Ribeiro, Pedro A. L.; Pinheiro, Gabriel A.; Prati, Ronaldo C.; Silva, João L. F. da",
    "booktitle": "Computational Science and Its Applications – ICCSA 2024 Workshops. Lecture Notes in Computer Science, vol 14823",
    "doi": "10.1007/978-3-031-65329-2_26",
    "id": "Quiles-2024-ContrastiveSMILES",
    "keywords": "SMILES, contrastive learning, molecular property prediction, self-supervised learning, chemoinformatics",
    "publisher": "Springer, Cham",
    "title": "Enhancing low-cost molecular property prediction with contrastive learning on SMILES representations",
    "type": "inproceedings",
    "url": "https://doi.org/10.1007/978-3-031-65329-2_26",
    "year": 2024
  },
  {
    "abstract": "This follow-up post discusses improvements in integer tokenization in newer LLMs (e.g., GPT-4, LLaMA 3, Mistral), where numbers are chunked consistently into digits or using right-to-left grouping. These changes significantly improve arithmetic reasoning and inference efficiency.",
    "author": "Millidge, Beren",
    "id": "Millidge-2024-IntTokFixed",
    "journal": "beren.io",
    "keywords": "tokenization, integers, GPT-4, LLaMA 3, Mistral, arithmetic",
    "title": "Integer tokenization is now much less insane",
    "type": "article",
    "url": "https://www.beren.io/2024-05-11-Integer-tokenization-is-now-much-less-insane/",
    "year": 2024
  },
  {
    "abstract": "NGG is a graph generation method using conditioned latent diffusion in the vector space of a variational graph autoencoder. It enables generation of graphs with target properties by guiding the diffusion with statistical feature vectors. NGG outperforms prior approaches and LLMs on multiple tasks in terms of control and efficiency.",
    "author": "Evdaimon, Iakovos; Nikolentzos, Giannis; Xypolopoulos, Christos; Kammoun, Ahmed; Chatzianastasis, Michail; Abdine, Hadi; Vazirgiannis, Michalis",
    "doi": "10.48550/arXiv.2403.01535",
    "id": "Evdaimon-2024-NGG",
    "journal": "arXiv",
    "keywords": "graph generation, latent diffusion, variational autoencoder, conditional generation, neural networks",
    "title": "Neural Graph Generator: Feature-Conditioned Graph Generation using Latent Diffusion Models",
    "type": "article",
    "url": "https://arxiv.org/abs/2403.01535",
    "year": 2024
  },
  {
    "abstract": "Twigs is a score-based diffusion framework for conditional graph generation. It uses a trunk diffusion process for primary variables and stem processes for dependent variables. Loop guidance coordinates these flows during sampling to model complex interactions. Experiments show strong performance on tasks like inverse molecular design and molecular optimization.",
    "author": "Mercatali, Giangiacomo; Verma, Yogesh; Freitas, Andre; Garg, Vikas",
    "doi": "10.48550/arXiv.2410.24012",
    "id": "Mercatali-2024-Twigs",
    "journal": "arXiv",
    "keywords": "graph generation, diffusion model, loop guidance, conditional generation, molecular optimization",
    "title": "Diffusion Twigs with Loop Guidance for Conditional Graph Generation",
    "type": "article",
    "url": "https://arxiv.org/abs/2410.24012",
    "year": 2024
  },
  {
    "abstract": "Graph DiT is a diffusion-based model for multi-conditional molecular generation. It combines a property encoder with a Transformer-based denoiser and introduces a graph-dependent noise model. The model outperforms baselines across nine evaluation metrics and proves effective in polymer design for gas separation.",
    "author": "Liu, Gang; Xu, Jiaxin; Luo, Tengfei; Jiang, Meng",
    "id": "Liu-2024-GraphDiT",
    "journal": "Advances in Neural Information Processing Systems",
    "keywords": "graph diffusion, multi-conditional generation, transformer, molecular design, inverse design",
    "note": "Peer-reviewed NeurIPS 2024 conference paper",
    "pages": "8065–8092",
    "publisher": "NeurIPS",
    "title": "Graph diffusion transformers for multi-conditional molecular generation",
    "type": "article",
    "url": "https://proceedings.neurips.cc/paper_files/paper/2024/hash/0f6931a9e339a012a9909306d7c758b4-Abstract-Conference.html",
    "volume": "37",
    "year": 2024
  },
  {
    "abstract": "MUDiff is a generative model that jointly generates atom features, 2D molecular graphs, and 3D coordinates using discrete and continuous diffusion. It employs a 3D equivariant graph transformer to denoise both types of representations. The model produces stable, valid, and diverse molecules and outperforms prior approaches on molecular generation tasks.",
    "author": "Hua, Chenqing; Luan, Sitao; Xu, Minkai; Ying, Zhitao; Fu, Jie; Ermon, Stefano; Precup, Doina",
    "id": "Hua-2024-MUDiff",
    "journal": "Proceedings of the Second Learning on Graphs Conference",
    "keywords": "molecule generation, diffusion model, 2D-3D representation, equivariant transformer",
    "pages": "33:1–33:26",
    "publisher": "PMLR",
    "title": "MUDiff: Unified Diffusion for Complete Molecule Generation",
    "type": "inproceedings",
    "url": "https://proceedings.mlr.press/v231/hua24a.html",
    "volume": "231",
    "year": 2024
  },
  {
    "abstract": "This study benchmarks six expressive GNNs within generative frameworks like GCPN, GraphAF, and GraphEBM on molecular generation tasks. Results on ZINC-250k show that advanced GNNs can improve performance, but GNN expressiveness is not strictly necessary. Edge feature extraction plays a critical role in achieving state-of-the-art results on key molecular design metrics.",
    "author": "Zou, Xiandong; Zhao, Xiangyu; Liò, Pietro; Zhao, Yiren",
    "id": "Zou-2024-ExpressiveGNN",
    "journal": "Proceedings of the Second Learning on Graphs Conference",
    "keywords": "graph generation, expressive GNNs, molecular design, generative modeling",
    "pages": "21:1–21:26",
    "publisher": "PMLR",
    "title": "Will More Expressive Graph Neural Networks Do Better on Generative Tasks?",
    "type": "inproceedings",
    "url": "https://proceedings.mlr.press/v231/zou24a.html",
    "volume": "231",
    "year": 2024
  },
  {
    "abstract": "GSHOT is a meta-learning framework for labeled graph generation under data scarcity. It transfers knowledge from auxiliary datasets and fine-tunes on new tasks with limited samples. GSHOT achieves higher fidelity than existing methods across diverse benchmarks.",
    "author": "Manchanda, Sahil; Gupta, Shubham; Ranu, Sayan; Bedathur, Srikanta J.",
    "id": "Manchanda-2024-GSHOT",
    "journal": "Proceedings of the Second Learning on Graphs Conference",
    "keywords": "graph generation, few-shot learning, meta-learning, generative models",
    "pages": "32:1–32:18",
    "publisher": "PMLR",
    "title": "Generative Modeling of Labeled Graphs Under Data Scarcity",
    "type": "inproceedings",
    "url": "https://proceedings.mlr.press/v231/manchanda24a.html",
    "volume": "231",
    "year": 2024
  },
  {
    "abstract": "Proteins are complex biomolecules that perform a variety of crucial functions within living organisms. Designing and generating novel proteins can pave the way for many future synthetic biology applications, including drug discovery. However, it remains a challenging computational task due to the large modeling space of protein structures. In this study, we propose a latent diffusion model that can reduce the complexity of protein modeling while flexibly capturing the distribution of natural protein structures in a condensed latent space. Specifically, we propose an equivariant protein autoencoder that embeds proteins into a latent space and then uses an equivariant diffusion model to learn the distribution of the latent protein representations. Experimental results demonstrate that our method can effectively generate novel protein backbone structures with high designability and efficiency.",
    "author": "Fu, Cong; Yan, Keqiang; Wang, Limei; Au, Wing Yee; McThrow, Michael Curtis; Komikado, Tao; Maruhashi, Koji; Uchino, Kanji; Qian, Xiaoning; Ji, Shuiwang",
    "id": "Fu-2024-LatentDiff",
    "journal": "Proceedings of the Second Learning on Graphs Conference",
    "keywords": "protein structure generation, latent diffusion, equivariant models, generative modeling",
    "pages": "29:1–29:17",
    "publisher": "PMLR",
    "title": "A Latent Diffusion Model for Protein Structure Generation",
    "type": "inproceedings",
    "url": "https://proceedings.mlr.press/v231/fu24a.html",
    "volume": "231",
    "year": 2024
  },
  {
    "abstract": "MIST-CF is a data-driven framework for predicting chemical formulas from MS/MS spectra without requiring fragmentation tree construction. Built on a Formula Transformer architecture, it ranks candidate formula–adduct pairs using energy-based modeling. Trained on a large open-access dataset, MIST-CF improves top-1 accuracy by 10% over other neural networks and performs competitively on the CASMI2022 benchmark—matching the best entry in positive mode, without manual tuning. This demonstrates a scalable, accurate method for MS1 formula inference directly from MS2 fragments.",
    "author": "Goldman, Samuel; Xin, Jiayi; Provenzano, Joules; Coley, Connor W.",
    "doi": "10.1021/acs.jcim.3c01082",
    "id": "Goldman-2024-MISTCF",
    "journal": "Journal of Chemical Information and Modeling",
    "pages": "2421–2431",
    "title": "MIST-CF: Chemical formula inference from tandem mass spectra",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.3c01082",
    "year": 2024
  },
  {
    "author": "{Hugging Face}",
    "id": "HuggingFace-2024-GPT2Doc",
    "note": "Official documentation for GPT-2 model usage in Hugging Face Transformers.",
    "publisher": "Hugging Face",
    "title": "GPT-2 — Hugging Face Transformers documentation",
    "type": "online",
    "url": "https://huggingface.co/docs/transformers/en/model_doc/gpt2",
    "year": 2024
  },
  {
    "author": "{Mestrelab Research}",
    "id": "Mestrelab-2024-MestReNovaManual",
    "keywords": "NMR software; spectroscopy; data analysis; MestReNova",
    "note": "Version 15.1 released November 5, 2024. Includes new tools such as 13C/HSQC search, MCR-ALS for peak purity, Screen plugin 2.0, Python integration, JSON export, and Gearbox support.",
    "publisher": "Mestrelab Research",
    "title": "MestReNova 15.1 Manual",
    "type": "book",
    "url": "https://mestrelab.com/downloads/mnova/manuals/MestReNova-15.1_Manual.pdf",
    "year": 2024
  },
  {
    "abstract": "The FARM project introduces a novel approach to molecular representation by incorporating functional group (FG) information directly into molecular representations. This FG-aware tokenization enhances the model's understanding of chemical language, leading to improved predictions of molecular properties. The approach includes a rule-based FG detection algorithm, FG-enhanced SMILES notation, and a dual-view molecular representation combining atom-level embeddings with structural learning. Evaluations on the MoleculeNet dataset demonstrate FARM's state-of-the-art performance in molecular property prediction tasks.",
    "author": "Nguyen, Thao; Huang, Kuan-Hao; Liu, Ge; Burke, Martin D.; Diao, Ying; Ji, Heng",
    "doi": "10.48550/arXiv.2410.02082",
    "id": "Nguyen-2024-FARM",
    "journal": "arXiv",
    "keywords": "Molecular Representation Learning, Functional Groups, SMILES Notation, Drug Discovery, Deep Learning",
    "title": "FARM: Functional Group-Aware Representations for Small Molecules",
    "type": "article",
    "url": "https://thaonguyen217.github.io/farm/",
    "year": 2024
  },
  {
    "abstract": "Self-supervised molecular representation learning has demonstrated great promise in bridging machine learning and chemical science to accelerate the development of new drugs. Due to the limited reaction data, existing methods are mostly pretrained by augmenting the intrinsic topology of molecules without effectively incorporating chemical reaction prior information, which makes them difficult to generalize to chemical reaction-related tasks. To address this issue, we propose ReaKE, a reaction knowledge embedding framework, which formulates chemical reactions as a knowledge graph. Specifically, we constructed a chemical synthesis knowledge graph with reactants and products as nodes and reaction rules as the edges. Based on the knowledge graph, we further proposed novel contrastive learning at both molecule and reaction levels to capture the reaction-related functional group information within and between molecules. Extensive experiments demonstrate the effectiveness of ReaKE compared with state-of-the-art methods on several downstream tasks, including reaction classification, product prediction, and yield prediction.",
    "author": "Xie, Jiancong; Wang, Yi; Rao, Jiahua; Zheng, Shuangjia; Yang, Yuedong",
    "doi": "10.1021/acs.jcim.4c00157",
    "id": "Xie-2024-ReaKE",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "6",
    "pages": "1945–1954",
    "title": "Self-Supervised Contrastive Molecular Representation Learning with a Chemical Synthesis Knowledge Graph",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.4c00157",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "This study introduces MVGC, a multi-view molecular pre-training framework that integrates three fundamental molecular representations to improve molecular property prediction. By reducing information loss and leveraging generative contrastive learning, MVGC enhances molecular representation learning. Evaluations on seven classification and three regression tasks show MVGC’s superior performance over state-of-the-art methods, with potential applications in chemical significance learning.",
    "author": "Liu, Yunwu; Zhang, Ruisheng; Yuan, Yongna; Ma, Jun; Li, Tongfeng; Yu, Zhixuan",
    "doi": "10.1007/s12539-024-00632-z",
    "id": "Liu-2024-MVGC",
    "journal": "Interdisciplinary Sciences: Computational Life Sciences",
    "keywords": "MVGC, Molecular Representation Learning, Generative Contrastive Learning, Multi-View Learning, Machine Learning, Cheminformatics",
    "pages": "741–754",
    "title": "A Multi-view Molecular Pre-training with Generative Contrastive Learning",
    "type": "article",
    "url": "https://doi.org/10.1007/s12539-024-00632-z",
    "volume": "16",
    "year": 2024
  },
  {
    "abstract": "This study explores the integration of large language models (LLMs) with molecular translation, addressing training efficacy and out-of-distribution challenges. A novel contrastive preference optimisation method is introduced to improve machine language-molecule translation, ensuring more accurate outputs. Experimental results using only 10% of the dataset show a 32% improvement over existing models. Additionally, a domain-agnostic evaluation method is proposed to assess hallucination in LLMs and promote responsible AI use in chemistry.",
    "author": "Gkoumas, Dimitris",
    "doi": "10.18653/v1/2024.langmol-1.3",
    "id": "Gkoumas-2024-ALMol",
    "journal": "Proceedings of the 1st Workshop on Language + Molecules (L+M 2024)",
    "keywords": "ALMol, Language-Molecule Translation, Large Language Models, Contrastive Preference Optimisation, Cheminformatics, AI in Chemistry",
    "pages": "21–27",
    "title": "ALMol: Aligned Language-Molecule Translation LLMs through Offline Preference Contrastive Optimisation",
    "type": "inproceedings",
    "url": "https://aclanthology.org/2024.langmol-1.3/",
    "year": 2024
  },
  {
    "abstract": "Transformer-based machine learning models have significantly impacted cheminformatics by improving tasks such as property prediction and molecular generation. This review discusses the strengths and limitations of different chemical representations used with transformers, highlights recent innovations in adapting transformers for chemical data, and outlines future research directions in applying these models to chemical problems.",
    "author": "Luong, Kha-Dinh; Singh, Ambuj",
    "doi": "10.1021/acs.jcim.3c02070",
    "id": "Luong-2024-TransformersCheminformatics",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "Transformers, Cheminformatics, Machine Learning, Property Prediction, Molecular Generation, Deep Learning",
    "number": "11",
    "pages": "4392–4409",
    "title": "Application of Transformers in Cheminformatics",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.3c02070",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "This study explores the challenges of accurately identifying stereoisomers in cheminformatics, particularly for predicting association constants between cyclodextrin and guest molecules using machine learning. Traditional molecular descriptors, including Isomeric SMILES, fail to capture stereochemical configurations effectively. Word embedding techniques such as Mol2Vec convert molecular representations but do not distinguish stereoisomers. This study proposes novel approaches integrating stereochemical information into word embeddings or treating Isomeric SMILES as text in NLP frameworks, improving stereoisomer discrimination in cheminformatics applications.",
    "author": "Tahıl, Gökhan; Delorme, Fabien; Le Berre, Daniel; Monflier, Éric; Sayede, Adlane; Tilloy, Sébastien",
    "doi": "10.1021/acs.jcim.4c00318",
    "id": "Tahil-2024-StereoisomersML",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "Stereoisomers, Machine Learning, Cheminformatics, Isomeric SMILES, Mol2Vec, Word Embedding, Cyclodextrin",
    "number": "14",
    "pages": "5451–5469",
    "title": "Stereoisomers Are Not Machine Learning’s Best Friends",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.4c00318",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "Chemical SuperLearner (ChemSL) is an automated framework for building interpretable machine learning models for molecular property prediction. By integrating molecular representations such as Morgan fingerprints, Mol2Vec, and molecular descriptors, ChemSL employs a stacked ensemble model combining 40 base learners. Benchmarking on MoleculeNet datasets (ESOL, FreeSolv, Lipophilicity) demonstrates ChemSL’s superior predictive performance. Its applicability is further validated using the Yield Sooting Index (YSI) database, highlighting its potential in cheminformatics, materials science, drug discovery, and fuel design.",
    "author": "Mohan, Balaji; Chang, Junseok",
    "doi": "10.1016/j.ces.2024.120111",
    "id": "Mohan-2024-ChemSL",
    "journal": "Chemical Engineering Science",
    "keywords": "Chemical SuperLearner, ChemSL, Machine Learning, Molecular Properties, Ensemble Learning, Cheminformatics, Drug Discovery",
    "pages": "120111",
    "title": "Chemical SuperLearner (ChemSL) - An automated machine learning framework for building physical and chemical properties model",
    "type": "article",
    "url": "https://doi.org/10.1016/j.ces.2024.120111",
    "volume": "294",
    "year": 2024
  },
  {
    "abstract": "Molecular representations are crucial in bio-cheminformatics, enabling machine learning applications in drug discovery, molecular property prediction, and chemical reactions. This review categorizes and evaluates widely used molecular representations, highlighting their strengths, limitations, and applicability, including macromolecule-specific representations. The study provides insights into their role in advancing cheminformatics and related fields.",
    "author": "Nguyen-Vo, Thanh-Hoang; Teesdale-Spittle, Paul; Harvey, Joanne E.; Nguyen, Binh P.",
    "doi": "10.1007/s12293-024-00414-6",
    "id": "NguyenVo-2024-MolecularRepresentations",
    "journal": "Memetic Computing",
    "keywords": "Molecular Representations, Bio-Cheminformatics, Drug Discovery, Machine Learning, Macromolecules, Chemical Reactions",
    "pages": "519–536",
    "title": "Molecular representations in bio-cheminformatics",
    "type": "article",
    "url": "https://doi.org/10.1007/s12293-024-00414-6",
    "volume": "16",
    "year": 2024
  },
  {
    "abstract": "Large Language Models (LLMs) have demonstrated strong performance in few-shot In-Context Learning (ICL) for chemical design. This study develops a semi-supervised method to enhance many-shot ICL in molecular inverse design and lead optimization. The approach iteratively integrates LLM-generated molecules with high predicted performance alongside experimental data. Additionally, a multi-modal LLM enables interactive molecular modification via text instructions. The method significantly improves molecular design ICL while maintaining accessibility for scientists.",
    "author": "Moayedpour, Saeed; Corrochano-Navarro, Alejandro; Sahneh, Faryad; Koetter, Alexander; Vymetal, Jiří; Kogler-Anele, Lorenzo; Mas, Pablo; Jangjou, Yasser; Li, Sizhen; Bailey, Michael; Bianciotto, Marc; Matter, Hans; Grebner, Christoph; Hessler, Gerhard; Bar-Joseph, Ziv; Jager, Sven",
    "id": "Moayedpour-2024-ICLMolecularDesign",
    "journal": "Proceedings of the 41st International Conference on Machine Learning",
    "keywords": "In-Context Learning, Molecular Inverse Design, Large Language Models, Lead Optimization, Multi-Modal LLMs, AI in Drug Discovery",
    "pages": "",
    "title": "Many-Shot In-Context Learning for Molecular Inverse Design",
    "type": "conference",
    "url": "https://proceedings.mlr.press/v235/",
    "volume": "235",
    "year": 2024
  },
  {
    "abstract": "Dr.Emb Appyter is a web-based platform for chemical compound search in drug discovery, leveraging embedding vectors to identify structurally and functionally similar compounds. It integrates multiple embedding methods, including fingerprinting, SMILES, and transcriptional response-based approaches. Using a Faiss-based search system, it efficiently retrieves closest compounds in the database. Dr.Emb Appyter also offers 3D visualization, heatmaps, and enrichment analysis. The platform is freely available at https://dremb.korea.ac.kr.",
    "author": "Kim, Songhyeon; Bong, Hyunsu; Jeon, Minji",
    "doi": "10.1002/jcc.27469",
    "id": "Kim-2024-DrEmbAppyter",
    "journal": "Journal of Computational Chemistry",
    "keywords": "Dr.Emb Appyter, Drug Discovery, Embedding Vectors, Deep Learning, Computational Chemistry, Virtual Screening",
    "number": "31",
    "pages": "2659-2665",
    "title": "Dr.Emb Appyter: A web platform for drug discovery using embedding vectors",
    "type": "article",
    "url": "https://doi.org/10.1002/jcc.27469",
    "volume": "45",
    "year": 2024
  },
  {
    "abstract": "Artificial intelligence (AI) has revolutionized biomedical research and drug development, with transformative potential in small molecules, RNA, and antibody-based therapies. This review analyzes AI's role in industrial drug development, examining clinical trials and industry adoption. A key challenge remains the lack of AI-designed drugs receiving approval. The paper advocates for large language models and diffusion models to address this gap, highlighting AI’s opportunities and challenges in modern pharmaceutical research.",
    "author": "Zhang, Yilun; Mastouri, Mohamed; Zhang, Yang",
    "doi": "10.1016/j.medj.2024.07.026",
    "id": "Zhang-2024-AIinDrugDiscovery",
    "journal": "Med",
    "keywords": "Artificial Intelligence, Drug Discovery, Clinical Trials, Machine Learning, Pharmaceutical Industry",
    "number": "9",
    "pages": "1050-1070",
    "title": "Accelerating drug discovery, development, and clinical trials by artificial intelligence",
    "type": "article",
    "url": "https://doi.org/10.1016/j.medj.2024.07.026",
    "volume": "5",
    "year": 2024
  },
  {
    "abstract": "Deep learning techniques have transformed molecular design by efficiently exploring chemical space. This systematic review examines strategies for molecule generation using chemical language models, analyzing 62 studies on Transformers, RNNs, GANs, VAEs, and S4 models. Key themes include molecular representation, dataset size, transfer learning, reinforcement learning, and model evaluation. The study identifies trends, advantages, and challenges in chemical language modeling over the past four years.",
    "author": "Flores-Hernandez, H.; Martinez-Ledesma, E.",
    "doi": "10.1186/s13321-024-00916-y",
    "id": "FloresHernandez-2024-DLCLM",
    "journal": "Journal of Cheminformatics",
    "keywords": "Chemical Language Models, Deep Learning, Molecule Generation, Transformers, RNNs, GANs, VAEs, Reinforcement Learning",
    "number": "129",
    "title": "A systematic review of deep learning chemical language models in recent era",
    "type": "article",
    "url": "https://doi.org/10.1186/s13321-024-00916-y",
    "volume": "16",
    "year": 2024
  },
  {
    "abstract": "Generative machine learning models have successfully designed drug-like molecules, leveraging SMILES as a sequence-based representation. While large chemistry models (LCMs) have been pre-trained, their role in drug discovery remains largely unexplored. This study models drug design as a causal language modeling task, applying reward modeling, supervised fine-tuning, and proximal policy optimization—similar to ChatGPT’s training. The model achieved 99.2% generation of molecules with pIC50 > 7 towards the amyloid precursor protein, with 100% validity and novelty. This highlights LCMs' potential in drug discovery, enabling efficient fine-tuning, reinforcement learning with human feedback, and the design of novel, non-patented alternatives.",
    "author": "Ye, Gavin",
    "doi": "10.1007/s10822-024-00559-z",
    "id": "Ye-2024-DeNovoGPT",
    "journal": "Journal of Computer-Aided Molecular Design",
    "keywords": "GPT, De Novo Drug Design, Reinforcement Learning, Molecular Generation, AI in Drug Discovery, SMILES, Large Chemistry Models",
    "number": "20",
    "title": "De novo drug design as GPT language modeling: large chemistry models with supervised and reinforcement learning",
    "type": "article",
    "url": "https://doi.org/10.1007/s10822-024-00559-z",
    "volume": "38",
    "year": 2024
  },
  {
    "abstract": "Molecular Property Prediction (MPP) is vital for drug discovery, crop protection, and environmental science. This review analyzes transformer-based models for MPP, examining data selection, model architecture, and pretraining strategies. The study highlights key challenges in benchmarking and future research directions.",
    "author": "Sultan, Afnan; Sieg, Jochen; Mathea, Miriam; Volkamer, Andrea",
    "doi": "10.1021/acs.jcim.4c00747",
    "id": "Sultan-2024-TransformersMPP",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "Transformers, Molecular Property Prediction, Deep Learning, SMILES, Self-supervised Learning",
    "number": "16",
    "pages": "6259–6280",
    "title": "Transformers for Molecular Property Prediction: Lessons Learned from the Past Five Years",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.4c00747",
    "volume": "64",
    "year": 2024
  },
  {
    "abstract": "Generative machine learning models have attracted intense interest for their ability to sample novel molecules with desired chemical or biological properties. Among these, language models trained on SMILES (Simplified Molecular-Input Line-Entry System) representations have been subject to the most extensive experimental validation and have been widely adopted. However, these models have what is perceived to be a major limitation: some fraction of the SMILES strings that they generate are invalid, meaning that they cannot be decoded to a chemical structure. This perceived shortcoming has motivated a remarkably broad spectrum of work designed to mitigate the generation of invalid SMILES or correct them post hoc. Here I provide causal evidence that the ability to produce invalid outputs is not harmful but is instead beneficial to chemical language models. I show that the generation of invalid outputs provides a self-corrective mechanism that filters low-likelihood samples from the language model output. Conversely, enforcing valid outputs produces structural biases in the generated molecules, impairing distribution learning and limiting generalization to unseen chemical space. Together, these results refute the prevailing assumption that invalid SMILES are a shortcoming of chemical language models and reframe them as a feature, not a bug.",
    "author": "Skinnider, Michael A.",
    "doi": "10.1038/s42256-024-00821-x",
    "id": "Skinnider-2024-InvalidSMILES",
    "journal": "Nature Machine Intelligence",
    "pages": "437–448",
    "publisher": "Nature Machine Intelligence",
    "title": "Invalid SMILES are beneficial rather than detrimental to chemical language models",
    "type": "article",
    "url": "https://www.nature.com/articles/s42256-024-00821-x",
    "volume": "6",
    "year": 2024
  },
  {
    "abstract": "Library matching by comparing carbon-13 nuclear magnetic resonance (13C NMR) spectra with spectral data in the library is a crucial method for compound identification. In our previous paper, we introduced a deep contrastive learning system called CReSS, which used a library that contained more structures. However, CReSS has two limitations: there were no unknown structures in the library, and a redundant library reduces the structure-elucidation accuracy. Herein, we replaced the oversize traditional libraries with focused libraries containing a small number of molecules. A previously generative model, CMGNet, was used to generate focused libraries for CReSS. The combined model achieved a Top-10 accuracy of 54.03% when tested on 6,471 13C NMR spectra. In comparison, CReSS with a random reference structure library achieved an accuracy of only 9.17%. Furthermore, to expand the advantages of the focused libraries, we proposed SAmpRNN, which is a recurrent neural network (RNN). With the large focused library amplified by SAmpRNN, the structure-identification accuracy of the model increased in 70.0% of the 30 random example cases. In general, cross-modal retrieval between 13C NMR spectra and structures based on focused libraries (CFLS) achieved high accuracy and provided more accurate candidate structures than traditional libraries for compound identification.",
    "author": "Sun, Hanyu; Xue, Xi; Liu, Xue; Hu, Hai-Yu; Deng, Yafeng; Wang, Xiaojian",
    "doi": "10.1021/acs.analchem.3c04294",
    "id": "Sun-2024-CrossModalNMR",
    "journal": "Analytical Chemistry",
    "number": "15",
    "pages": "5763–5770",
    "title": "Cross-Modal Retrieval Between 13C NMR Spectra and Structures Based on Focused Libraries",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.3c04294",
    "volume": "96",
    "year": 2024
  },
  {
    "abstract": "The application of machine learning models in chemistry has made remarkable strides in recent years. While analytical chemistry has received considerable interest from machine learning practitioners, its adoption into everyday use remains limited. Among the available analytical methods, Infrared (IR) spectroscopy stands out in terms of affordability, simplicity, and accessibility. However, its use has been limited to the identification of a selected few functional groups, as most peaks lie beyond human interpretation. We present a transformer model that enables chemists to leverage the complete information contained within an IR spectrum to directly predict the molecular structure. To cover a large chemical space, we pretrain the model using 634,585 simulated IR spectra and fine-tune it on 3,453 experimental spectra. Our approach achieves a top–1 accuracy of 44.4% and top–10 accuracy of 69.8% on compounds containing 6 to 13 heavy atoms. When solely predicting scaffolds, the model accurately predicts the top–1 scaffold in 84.5% and among the top–10 in 93.0% of cases.",
    "author": "Alberts, Marvin; Laino, Teodoro; Vaucher, Alain C.",
    "doi": "10.1038/s42004-024-01341-w",
    "id": "Alberts-2024-IRStructure",
    "journal": "Communications Chemistry",
    "number": "268",
    "title": "Leveraging Infrared Spectroscopy for Automated Structure Elucidation",
    "type": "article",
    "url": "https://doi.org/10.1038/s42004-024-01341-w",
    "volume": "7",
    "year": 2024
  },
  {
    "abstract": "Machine learning (ML) has experienced significant advancements due to improvements in computational hardware and the development of new algorithms for artificial intelligence. These advancements have led to the integration of ML with mass spectrometry (MS), enhancing data analysis and interpretation. This review provides an up-to-date overview of recent developments in ML applications within MS-based techniques, offering critical insights into current challenges and potential future directions in this interdisciplinary field.",
    "author": "Beck, Armen G.; Muhoberac, Matthew; Randolph, Caitlin E.; Beveridge, Connor H.; Wijewardhane, Prageeth R.; Kenttämaa, Hilkka I.; Chopra, Gaurav",
    "doi": "10.1021/acsmeasuresciau.3c00060",
    "id": "Beck-2024-MLMS",
    "journal": "ACS Measurement Science Au",
    "number": "3",
    "pages": "233–246",
    "title": "Recent Developments in Machine Learning for Mass Spectrometry",
    "type": "article",
    "url": "https://pubs.acs.org/doi/10.1021/acsmeasuresciau.3c00060",
    "volume": "4",
    "year": 2024
  },
  {
    "abstract": "Large Language Models (LLMs) have shown significant problem-solving capabilities across predictive and generative tasks in chemistry. However, their proficiency in multi-step chemical reasoning remains underexplored. We introduce a new challenge: molecular structure elucidation, which involves deducing a molecule’s structure from various types of spectral data. Solving such a molecular puzzle, akin to solving crossword puzzles, poses reasoning challenges that require integrating clues from diverse sources and engaging in iterative hypothesis testing. To address this challenging problem with LLMs, we present MolPuzzle, a benchmark comprising 217 instances of structure elucidation, which feature over 23,000 QA samples presented in a sequential puzzle-solving process, involving three interlinked sub-tasks: molecule understanding, spectrum interpretation, and molecule construction. Our evaluation of 12 LLMs reveals that the best-performing LLM, GPT-4o, performs significantly worse than humans, with only a small portion (1.4%) of its answers exactly matching the ground truth. However, it performs nearly perfectly in the first subtask of molecule understanding, achieving accuracy close to 100%. This discrepancy highlights the potential of developing advanced LLMs with improved chemical reasoning capabilities in the other two sub-tasks. Our MolPuzzle dataset and evaluation code are available at this link: https://github.com/KehanGuo2/MolPuzzle.",
    "author": "Guo, K.; Nan, B.; Zhou, Y.; Guo, T.; Guo, Z.; Surve, M.; Liang, Z.; Chawla, N. V.; Wiest, O.; Zhang, X.",
    "booktitle": "Proceedings of the Thirty-Eighth Conference on Neural Information Processing Systems (NeurIPS) Datasets and Benchmarks Track",
    "id": "Guo-2024-MolPuzzle",
    "title": "Can LLMs Solve Molecule Puzzles? A Multimodal Benchmark for Molecular Structure Elucidation",
    "type": "conference",
    "url": "https://proceedings.neurips.cc/paper_files/paper/2024/hash/f2b9e8e7a36d43ddfd3d55113d56b1e0-Abstract-Datasets_and_Benchmarks_Track.html",
    "year": 2024
  },
  {
    "abstract": "Mass spectra, which are agglomerations of ionized fragments from targeted molecules, play a crucial role across various fields for the identification of molecular structures. A prevalent analysis method involves spectral library searches, where unknown spectra are cross-referenced with a database. The effectiveness of such search-based approaches, however, is restricted by the scope of the existing mass spectra database, underscoring the need to expand the database via mass spectra prediction. In this research, we propose the Motif-based Mass Spectrum prediction Network (MoMS-Net), a GNN-based architecture to predict the mass spectra pattern utilizing the structural motif information of the molecule. MoMS-Net considers both a molecule and its substructures as a graph form, which facilitates the incorporation of long-range dependencies while using less memory compared to the graph transformer model. We evaluated our model over various types of mass spectra and showed the validity and superiority over the conventional models.",
    "author": "Park, Jiwon; Jo, Jeonghee; Yoon, Sungroh",
    "doi": "10.1038/s41598-024-51760-x",
    "id": "Park-2024-MoMSNet",
    "journal": "Scientific Reports",
    "pages": "1400",
    "title": "Mass Spectra Prediction with Structural Motif-based Graph Neural Networks",
    "type": "article",
    "url": "https://www.nature.com/articles/s41598-024-51760-x",
    "volume": "14",
    "year": 2024
  },
  {
    "abstract": "The rapid proliferation of new psychoactive substances (NPS) poses significant challenges to conventional mass-spectrometry-based identification methods due to the absence of reference spectra for these emerging substances. This paper introduces PS²MS, an AI-powered predictive system designed specifically to address the limitations of identifying the emergence of unidentified novel illicit drugs. PS²MS builds a synthetic NPS database by enumerating feasible derivatives of known substances and uses deep learning to generate mass spectra and chemical fingerprints. When the mass spectrum of an analyte does not match any known reference, PS²MS simultaneously examines the chemical fingerprint and mass spectrum against the putative NPS database using integrated metrics to deduce possible identities. Experimental results affirm the effectiveness of PS²MS in identifying cathinone derivatives within real evidence specimens, signifying its potential for practical use in identifying emerging drugs of abuse for researchers and forensic experts.",
    "author": "Lin, Yi-Ching; Chien, Wei-Chen; Wang, Yu-Xuan; Wang, Ying-Hau; Yang, Feng-Shuo; Tseng, Li-Ping; Hung, Jui-Hung",
    "doi": "10.1021/acs.analchem.3c05019",
    "id": "Hung-2024-PS2MS",
    "journal": "Analytical Chemistry",
    "note": "Open Access, published on 15 March 2024",
    "number": "12",
    "pages": "",
    "title": "PS²MS: A Deep Learning-Based Prediction System for Identifying New Psychoactive Substances Using Mass Spectrometry",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.3c05019",
    "volume": "96",
    "year": 2024
  },
  {
    "abstract": "Structural annotation of small molecules in tandem mass spectrometry has always been a central challenge in mass spectrometry analysis, especially using a miniaturized mass spectrometer for on-site testing. Here, we propose the Transformer enabled Fragment Tree (TeFT) method, which combines various types of fragmentation tree models and a deep learning Transformer module. It is aimed to generate the specific structure of molecules de novo solely from mass spectrometry spectra. The evaluation results on different open-source databases indicated that the proposed model achieved remarkable results in that the majority of molecular structures of compounds in the test can be successfully recognized. Also, the TeFT has been validated on a miniaturized mass spectrometer with low-resolution spectra for 16 flavonoid alcohols, achieving complete structure prediction for 8 substances. Finally, TeFT confirmed the structure of the compound contained in a Chinese medicine substance called the Anweiyang capsule. These results indicate that the TeFT method is suitable for annotating fragmentation peaks with clear fragmentation rules, particularly when applied to on-site mass spectrometry with lower mass resolution.",
    "author": "Yang, Yiming; Sun, Shuang; Yang, Shuyuan; Yang, Qin; Lu, Xinqiong; Wang, Xiaohao; Yu, Quan; Huo, Xinming; Qian, Xiang",
    "doi": "PMID:38740942",
    "id": "Huo-2024-TeFT",
    "journal": "Communications Chemistry",
    "note": "Open access, published 13 May 2024",
    "pages": "Article 109",
    "publisher": "Springer Nature",
    "title": "Structural annotation of unknown molecules in a miniaturized mass spectrometer based on a transformer enabled fragment tree method",
    "type": "article",
    "url": "https://www.semanticscholar.org/paper/c9a49c1a5597eef85a7d78743a364b516aabf89a",
    "volume": "7",
    "year": 2024
  },
  {
    "abstract": "Infrared (IR) spectroscopy is an efficient method for identifying unknown chemical compounds. To accelerate IR spectrum analysis, various calculation and machine learning methods for simulating IR spectra of molecules have been studied in chemical science. However, existing calculation and machine learning methods assumed a rigid constraint that all molecules are in the gas phase, i.e., they overlooked the phase dependency of the IR spectra. In this paper, we propose an efficient phase-aware machine learning method to generate phase-conditioned IR spectra from 2D molecular structures. To this end, we devised a phase-aware graph neural network and combined it with a transformer decoder. To the best of our knowledge, the proposed method is the first IR spectrum generator that can generate the phase-conditioned IR spectra of real-world complex molecules. The proposed method outperformed state-of-the-art methods in the tasks of generating IR spectra on a benchmark dataset containing experimentally measured 11,546 IR spectra of 10,288 unique molecules. All implementations of the proposed method are publicly available at https://github.com/ngs00/PASGeN.",
    "author": "Na, Gyoung S.",
    "doi": "10.1021/acs.analchem.4c04786",
    "id": "Na-2024-PhaseInfrared",
    "journal": "Analytical Chemistry",
    "number": "49",
    "pages": "19659–19669",
    "title": "Deep Learning for Generating Phase-Conditioned Infrared Spectra",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.4c04786",
    "volume": "96",
    "year": 2024
  },
  {
    "abstract": "In this article, we present a novel approach to predicting chemical structures from their infrared (IR) spectra using deep Q-learning. IR spectra measurements are widely used in chemical analysis because they provide information on the types and characteristics of chemical bonds present within compounds. However, there are currently no algorithms to predict the entire chemical structure of a broad range of compounds relying solely on IR spectra, unless there is an exact or closely matched spectrum in an existing reference spectra library. To address this, we apply double deep Q-learning for automated prediction of the entire chemical structures of organic compounds based on IR spectra. Our method builds predicted structures by starting from a single carbon atom and subsequently adding an atom and bond step-by-step by ranking the rewards of each possible addition based on Q-values. We devised new structural similarity metrics, atom bond count and substructure count metrics to achieve our goal. Compared to the commonly used structural similarity score, the Jaccard index of extended-connectivity fingerprints, the devised metrics exhibit more suitable properties for Q-learning. The deep Q-model, which uses the combination of our two proposed metrics, gives the overall best performance and can generate structures similar to the actual structures in terms of their structural features and molecular weight in most tested cases.",
    "author": "Ellis, Joshua D.; Iqbal, Razib; Yoshimatsu, Keiichi",
    "doi": "10.1109/TAI.2023.3287947",
    "id": "Ellis-2024-DeepQLearning",
    "journal": "IEEE Transactions on Artificial Intelligence",
    "title": "Deep Q-Learning-Based Molecular Graph Generation for Chemical Structure Prediction From Infrared Spectra",
    "type": "article",
    "url": "https://doi.org/10.1109/TAI.2023.3287947",
    "year": 2024
  },
  {
    "abstract": "Molecular spectroscopy studies the interaction of molecules with electromagnetic radiation, and interpreting the resultant spectra is invaluable for deducing the molecular structures. However, predicting the molecular structure from spectroscopic data remains a challenging problem due to the complexity of molecular spectra and the high dimensionality of the search space. Here, we introduce DeepSPInN, a deep reinforcement learning-based framework for molecular structure prediction from infrared and 13C NMR spectra. Our model leverages a combination of policy-based reinforcement learning and deep neural networks to learn an optimal decision-making strategy for constructing molecular structures. We validate DeepSPInN on benchmark datasets and demonstrate its effectiveness in accurately predicting molecular structures with high efficiency. Our results suggest that deep reinforcement learning offers a promising approach for automated molecular structure elucidation and can significantly enhance the capabilities of spectroscopic analysis.",
    "author": "Devata, Sriram; Sridharan, Bhuvanesh; Mehta, Sarvesh; Pathak, Yashaswi; Laghuvarapu, Siddhartha; Varma, Girish; Priyakumar, U. Deva",
    "doi": "10.1039/D4DD00008K",
    "id": "Devata-2024-DeepSPInN",
    "journal": "Digital Discovery",
    "title": "DeepSPInN – Deep reinforcement learning for molecular structure prediction from infrared and 13C NMR spectra",
    "type": "article",
    "url": "https://doi.org/10.1039/D4DD00008K",
    "year": 2024
  },
  {
    "abstract": "Rapid determination of molecular structures can greatly accelerate workflows across many chemical disciplines. However, elucidating structure using only one-dimensional (1D) NMR spectra, the most readily accessible data, remains an extremely challenging problem because of the combinatorial explosion of the number of possible molecules as the number of constituent atoms is increased. Here, we introduce a multitask machine learning framework that predicts the molecular structure (formula and connectivity) of an unknown compound solely based on its 1D 1H and/or 13C NMR spectra. First, we show how a transformer architecture can be constructed to efficiently solve the task, traditionally performed by chemists, of assembling large numbers of molecular fragments into molecular structures. Integrating this capability with a convolutional neural network, we build an end-to-end model for predicting structure from spectra that is fast and accurate. We demonstrate the effectiveness of this framework on molecules with up to 19 heavy (non-hydrogen) atoms, a size for which there are trillions of possible structures. Without relying on any prior chemical knowledge such as the molecular formula, we show that our approach predicts the exact molecule 69.6% of the time within the first 15 predictions, reducing the search space by up to 11 orders of magnitude.",
    "author": "Hu, Frank; Chen, Michael S.; Rotskoff, Grant M.; Kanan, Matthew W.; Markland, Thomas E.",
    "doi": "10.1021/acscentsci.4c01132",
    "id": "Hu-2024-MultitaskNMR",
    "journal": "ACS Central Science",
    "number": "11",
    "pages": "2162–2170",
    "title": "Accurate and efficient structure elucidation from routine one-dimensional NMR spectra using multitask machine learning",
    "type": "article",
    "url": "https://doi.org/10.1021/acscentsci.4c01132",
    "volume": "10",
    "year": 2024
  },
  {
    "author": "Domżał, Barbara; Nawrocka, Ewa Klaudia; Gołowicz, Dariusz; Ciach, Michał Aleksander; Miasojedow, Błażej; Kazimierczuk, Krzysztof; Gambin, Anna",
    "doi": "10.1021/acs.analchem.3c03594",
    "id": "Domzal-2024-Magnetstein",
    "journal": "Analytical Chemistry",
    "number": "1",
    "pages": "188-196",
    "title": "Magnetstein: An Open-Source Tool for Quantitative NMR Mixture Analysis Robust to Low Resolution, Distorted Lineshapes, and Peak Shifts",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.3c03594",
    "volume": "96",
    "year": 2024
  },
  {
    "abstract": "Spectroscopic techniques are essential tools for determining the structure of molecules. Different spectroscopic techniques, such as Nuclear magnetic resonance (NMR), Infrared spectroscopy, and Mass Spectrometry, provide insight into the molecular structure, including the presence or absence of functional groups. Chemists leverage the complementary nature of the different methods to their advantage. However, the lack of a comprehensive multimodal dataset, containing spectra from a variety of spectroscopic techniques, has limited machine-learning approaches mostly to single-modality tasks for predicting molecular structures from spectra. Here we introduce a dataset comprising simulated $^1$H-NMR, $^{13}$C-NMR, HSQC-NMR, Infrared, and Mass spectra (positive and negative ion modes) for 790k molecules extracted from chemical reactions in patent data. This dataset enables the development of foundation models for integrating information from multiple spectroscopic modalities, emulating the approach employed by human experts. Additionally, we provide benchmarks for evaluating single-modality tasks such as structure elucidation, predicting the spectra for a target molecule, and functional group predictions. This dataset has the potential automate structure elucidation, streamlining the molecular discovery pipeline from synthesis to structure determination. The dataset and code for the benchmarks can be found at https://rxn4chemistry.github.io/multimodal-spectroscopic-dataset.",
    "author": "Alberts, Marvin; Schilter, Oliver; Zipoli, Federico; Hartrampf, Nina; Laino, Teodoro",
    "booktitle": "NeurIPS 2024 Datasets and Benchmarks Track",
    "doi": "10.48550/arXiv.2407.17492",
    "id": "Alberts-2024-MultimodalSpectroscopy",
    "keywords": "Multimodal spectroscopy, NMR, IR, Mass Spectrometry, molecular structure, machine learning, dataset, structure elucidation",
    "title": "Unraveling molecular structure: A multimodal spectroscopic dataset for chemistry",
    "type": "conference",
    "url": "https://arxiv.org/abs/2407.17492",
    "year": 2024
  },
  {
    "abstract": "Nuclear magnetic resonance (NMR) spectroscopy is routinely used to study the properties of matter. Therefore, different materials can be classified according to their NMR spectra. However, the NMR spectra cannot be observed directly, and only the NMR signal, which is a sum of complex exponentials, is directly observable in practice. A popular approach to recover the spectrum is to perform harmonic retrieval, i.e., to reconstruct exactly the spectrum from the NMR signal. However, even when this approach fails, the spectrum might still be classified accurately. In this work, we model the spectrum as an atomic measure to study the performance of classifying the spectrum from the NMR signal, and to determine how it degrades in the presence of additive noise and changes in field intensity. Although we focus on NMR signals, our results are broadly applicable to sum-of-exponential signals. We show numerical results illustrating our claims.",
    "author": "Lehmann, Pedro Izquierdo; Xavier, Aline; Andia, Marcelo E.; Sing-Long, Carlos A.",
    "doi": "10.1109/ICASSP49357.2023.10094712",
    "id": "Lehmann-2024-NMRClassification",
    "journal": "Proceedings of the 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)",
    "pages": "9771-9775",
    "publisher": "IEEE",
    "title": "Exact Classification of NMR Spectra from NMR Signals",
    "type": "conference",
    "url": "https://ieeexplore.ieee.org/document/10446412",
    "year": 2024
  },
  {
    "abstract": "Extracting spin system parameters from 1D high resolution H-NMR spectra can be an intricate task requiring sophisticate methods. With a few exceptions methods to perform such a total line shape analysis commonly rely on local optimization techniques which for increasing complexity of the underlying spin system tend to reveal local solutions. In this work we propose a full Bayesian modeling approach based on a quantum mechanical model of the spin system. The Bayesian formalism provides a global optimization strategy which allows to efficiently include prior knowledge about the spin system or to incorporate additional constraints concerning the parameters of interest. The proposed algorithm has been tested on synthetic and real 1D H-NMR data for various spin systems with increasing complexity. The results show that the Bayesian algorithm provides accurate estimates even for complex spectra with many overlapping regions, and that it can cope with symmetry induced local minima. By providing an unbiased estimate of the model evidence the proposed algorithm furthermore offers a way to discriminate between different spin system candidates.",
    "author": "De Lorenzi, Flavio; Weinmann, Tom; Bruderer, Simon; Heitmann, Björn; Henrici, Andreas; Stingelin, Simon",
    "doi": "10.1016/j.jmr.2024.107723",
    "id": "DeLorenzi-2024-BayesianNMR",
    "journal": "Journal of Magnetic Resonance",
    "pages": "107723",
    "publisher": "Elsevier",
    "title": "Bayesian analysis of 1D 1H-NMR spectra",
    "type": "article",
    "url": "https://doi.org/10.1016/j.jmr.2024.107723",
    "volume": "364",
    "year": 2024
  },
  {
    "abstract": "High-throughput single-cell profiling provides an unprecedented ability to uncover the molecular states of millions of cells. These technologies are, however, destructive to cells and tissues, raising practical challenges when aiming to track dynamic biological processes. As the same cell cannot be observed at multiple time points, as it changes in time and space in response to a stimulus or perturbation, these large-scale measurements only produce unaligned data sets. In this Primer, we show how such challenges can be effectively addressed using the unifying framework of optimal transport theory and tackled using the many algorithms that have been proposed for the range of scenarios of key interest in computational biology. We further review recent advances integrating optimal transport and deep learning that allow forecasting heterogeneous cellular dynamics and behaviour, crucial in particular for pressing problems in personalized medicine.",
    "author": "Bunne, Charlotte; Schiebinger, Geoffrey; Krause, Andreas; Regev, Aviv; Cuturi, Marco",
    "doi": "10.1038/s43586-024-00334-2",
    "id": "Bunne-2024-OptimalTransport",
    "journal": "Nature Reviews Methods Primers",
    "keywords": "Optimal transport, Spatial omics, Single-cell RNA sequencing, Computational biology, Cell-cell communication",
    "note": "This paper discusses optimal transport applications in spatial transcriptomics and single-cell RNA sequencing.",
    "title": "Optimal transport for single-cell and spatial omics",
    "type": "article",
    "url": "https://doi.org/10.1038/s43586-024-00334-2",
    "volume": "4",
    "year": 2024
  },
  {
    "abstract": "This paper introduces Coupled Bridge Matching (BM²), a non-iterative approach for learning Schrödinger bridges using neural networks. A Schrödinger bridge establishes a dynamic transport map between two target distributions via a reference process, simultaneously solving an associated entropic optimal transport problem. The authors provide a preliminary theoretical analysis of the convergence properties of BM², supported by numerical experiments demonstrating its effectiveness.",
    "author": "Peluchetti, Stefano",
    "doi": "10.48550/arXiv.2409.09376",
    "id": "Peluchetti-2024-BM2Schrodinger",
    "journal": "arXiv",
    "title": "BM²: Coupled Schrödinger Bridge Matching",
    "type": "article",
    "url": "https://arxiv.org/abs/2409.09376",
    "year": 2024
  },
  {
    "abstract": "This paper addresses the problem of combining multiple experts' diffusion process models over a finite time horizon by minimizing the weighted Kullback-Leibler divergence. The authors establish the existence and uniqueness of the barycentre model and provide an explicit representation of the Radon-Nikodym derivative relative to the average drift model. They also incorporate agent-specific constraints, resulting in an optimal model that distorts the experts' barycentre to include these constraints. Two deep learning algorithms are proposed to determine the optimal drift of the combined model, facilitating efficient simulations. The paper concludes with an application that combines implied volatility smile models estimated from different datasets.",
    "author": "Jaimungal, Sebastian; Pesenti, Silvana M.",
    "doi": "10.48550/arXiv.2407.04860",
    "id": "Jaimungal-2024-KLBarycentre",
    "journal": "arXiv",
    "title": "Kullback-Leibler Barycentre of Stochastic Processes",
    "type": "article",
    "url": "https://arxiv.org/abs/2407.04860",
    "year": 2024
  },
  {
    "abstract": "This work connects optimal transport and variational inference, presenting a framework for sampling and generative modeling centered around divergences on path space. It introduces the Controlled Monte Carlo Diffusion (CMCD) sampler for Bayesian computation, a score-based annealing technique that adapts both forward and backward dynamics in a diffusion model. The paper clarifies the relationship between the EM-algorithm and iterative proportional fitting for Schrödinger bridges and proposes a regularized objective to improve efficiency. CMCD is grounded in the Jarzynski and Crooks identities from statistical physics and demonstrates superior performance compared to existing methods across various experiments.",
    "author": "Vargas, Francisco; Padhy, Shreyas; Blessing, Denis; Nusken, Nikolas",
    "id": "Vargas-2024-CMCD",
    "journal": "International Conference on Learning Representations (ICLR) 2024",
    "keywords": "Optimal Transport, Variational Inference, Monte Carlo Methods, Bayesian Computation",
    "title": "Transport Meets Variational Inference: Controlled Monte Carlo Diffusions",
    "type": "article",
    "url": "https://arxiv.org/abs/2307.01050",
    "year": 2024
  },
  {
    "abstract": "This study introduces a hybrid physics-informed approach for metabolic cybergenetics, integrating machine-learning surrogates with flux balance analysis to optimize metabolic control.",
    "author": "Espinel-Ríos, Sebastián; Avalos, José L.",
    "doi": "10.1021/acs.iecr.4c00001",
    "id": "EspinelRios-2024-Cybergenetics",
    "journal": "Industrial & Engineering Chemistry Research",
    "keywords": "Physics-Informed, Metabolic Cybergenetics, Machine Learning, Flux Balance Analysis",
    "number": "15",
    "pages": "6685-6700",
    "publisher": "American Chemical Society (ACS)",
    "title": "Hybrid Physics-Informed Metabolic Cybergenetics: Process Rates Augmented with Machine-Learning Surrogates Informed by Flux Balance Analysis",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.iecr.4c00001",
    "volume": "63",
    "year": 2024
  },
  {
    "abstract": "This work applies Optimal Transport (OT) to learn metrics for 3D point clouds, improving performance in classification and retrieval tasks. By employing Wasserstein distance and contrastive learning, the framework enhances spatial geometry awareness and metric quality. The study introduces novel algorithms utilizing Sinkhorn approximation and evaluates on large-scale 3D datasets.",
    "author": "Katageri, Siddharth; Sarkar, Srinjay; Sharma, Charu",
    "doi": "10.1109/wacvw60836.2024.00063",
    "id": "Katageri-2024-MetricLearning3D",
    "journal": "2024 IEEE/CVF Winter Conference on Applications of Computer Vision Workshops (WACVW)",
    "keywords": "Optimal Transport; Metric Learning; 3D Point Clouds; Wasserstein Distance; Contrastive Learning",
    "number": "",
    "pages": "552–560",
    "title": "Metric Learning for 3D Point Clouds Using Optimal Transport",
    "type": "conference",
    "url": "https://doi.org/10.1109/wacvw60836.2024.00063",
    "volume": "",
    "year": 2024
  },
  {
    "abstract": "Optimal transport provides a metric which quantifies the dissimilarity between probability measures. For measures supported in discrete metric spaces, finding the optimal transport distance has cubic time complexity in the size of the space. However, measures supported on trees admit a closed-form optimal transport that can be computed in linear time. This paper aims to find an optimal tree structure for a given discrete metric space so that the tree-Wasserstein distance approximates the optimal transport distance in the original space. By casting the problem in ultrametric spaces, the authors optimize over the space of ultrametric trees via projected gradient descent, using hierarchical minimum spanning tree algorithms. Experimental results on real datasets show improved performance over previous approaches and accurate reconstruction of ground truth trees.",
    "author": "Chen, Samantha; Tabaghi, Puoya; Wang, Yusu",
    "doi": "10.1609/aaai.v38i18.30052",
    "id": "Chen-2024-UltrametricTreesOT",
    "journal": "Proceedings of the AAAI Conference on Artificial Intelligence",
    "keywords": "Optimal Transport; Metric Learning; Ultrametric Trees; Tree-Wasserstein Distance; Gradient Descent",
    "number": "18",
    "pages": "20657–20665",
    "title": "Learning Ultrametric Trees for Optimal Transport Regression",
    "type": "article",
    "url": "https://doi.org/10.1609/aaai.v38i18.30052",
    "volume": "38",
    "year": 2024
  },
  {
    "abstract": "The Light Schrödinger Bridge introduces a simplified, simulation-free approach to solving the Schrödinger Bridge problem, a probabilistic method closely related to entropic optimal transport. The authors combine parameterization of Schrödinger potentials with energy-based representations, resulting in computational efficiency and scalability for moderate-dimensional problems. This method outperforms traditional approaches by reducing complexity and generalizing well across applications in machine learning and data analysis. The proposed algorithm is validated on high-dimensional datasets, offering a robust and scalable solution for density estimation and trajectory inference.",
    "author": "Korotin, Alexander; Gushchin, Nikita; Burnaev, Evgeny",
    "doi": "10.48550/arXiv.2310.01174",
    "id": "Korotin-2024-LightSB",
    "journal": "Proceedings of the International Conference on Learning Representations (ICLR) 2024",
    "keywords": "Schrödinger Bridge; Optimal Transport; Density Estimation; Trajectory Inference; Machine Learning; Computational Efficiency",
    "number": "",
    "pages": "",
    "title": "Light Schrödinger Bridge",
    "type": "conference",
    "url": "https://arxiv.org/abs/2310.01174",
    "volume": "",
    "year": 2024
  },
  {
    "abstract": "",
    "author": "Lavenant, Hugo; Zhang, Stephen; Kim, Young-Heon; Schiebinger, Geoffrey",
    "doi": "10.1214/23-AAP1969",
    "id": "Lavenant-2024-TrajectoryInference",
    "journal": "The Annals of Applied Probability",
    "keywords": "",
    "number": "1A",
    "pages": "N/A",
    "title": "Toward a Mathematical Theory of Trajectory Inference",
    "type": "article",
    "url": "https://doi.org/10.1214/23-AAP1969",
    "volume": "34",
    "year": 2024
  },
  {
    "abstract": "Focuses on geospatial and disease dynamics models to address challenges posed by antimicrobial resistance (AMR) and COVID-19.",
    "author": "Zhao, Cheng",
    "doi": "10.3929/ethz-b-000663070",
    "id": "Zhao-2024-AMRThesis",
    "note": "Doctoral thesis supervised by Thomas Van Boeckel.",
    "publisher": "ETH Zurich",
    "title": "Geospatial and Disease Dynamics Modelling Against Infectious Diseases: Antimicrobial Resistance and COVID-19",
    "type": "thesis",
    "url": "https://doi.org/10.3929/ethz-b-000663070",
    "year": 2024
  },
  {
    "abstract": "This article outlines target-based strategies and recommendations for global sustainable access to effective antibiotics, aimed at ensuring measurable progress during the 2024 UN General Assembly.",
    "author": "Mendelson, Marc; Lewnard, Joseph A.; Sharland, Mike; Cook, Aislinn; Pouwels, Koen B.; Alimi, Yewande; Mpundu, Mirfin; Wesangula, Evelyn; Weese, Jeffrey Scott; Røttingen, John-Arne; Laxminarayan, Ramanan",
    "doi": "10.1016/S0140-6736(24)01019-5",
    "id": "Mendelson-2024-AMRGeneralAssembly",
    "journal": "The Lancet",
    "number": "10443",
    "pages": "2551-2564",
    "publisher": "Elsevier BV",
    "title": "Ensuring progress on sustainable access to effective antibiotics at the 2024 UN General Assembly: a target-based approach",
    "type": "article",
    "url": "https://doi.org/10.1016/S0140-6736(24)01019-5",
    "volume": "403",
    "year": 2024
  },
  {
    "abstract": "This article discusses strategies for the development and equitable access to antibiotics, vaccines, and diagnostics to address the global antimicrobial resistance crisis, emphasizing innovation and policy reforms.",
    "author": "Laxminarayan, Ramanan; Impalli, Isabella; Rangarajan, Radha; Cohn, Jennifer; Ramjeet, Kavi; Trainor, Betsy Wonderly; Strathdee, Steffanie; Sumpradit, Nithima; Berman, Daniel; Wertheim, Heiman; Outterson, Kevin; Srikantiah, Padmini; Theuretzbacher, Ursula",
    "doi": "10.1016/S0140-6736(24)00878-X",
    "id": "Laxminarayan-2024-AccessAMR",
    "journal": "The Lancet",
    "number": "10443",
    "pages": "2534-2550",
    "publisher": "Elsevier BV",
    "title": "Expanding antibiotic, vaccine, and diagnostics development and access to tackle antimicrobial resistance",
    "type": "article",
    "url": "https://doi.org/10.1016/S0140-6736(24)00878-X",
    "volume": "403",
    "year": 2024
  },
  {
    "abstract": "This study analyzes the burden of bacterial AMR in low- and middle-income countries, focusing on interventions to mitigate it through evidence-based approaches and modeling.",
    "author": "Lewnard, Joseph A.; Charani, Esmita; Gleason, Alec; Hsu, Li Yang; Khan, Wasif Ali; Karkey, Abhilasha; Chandler, Clare I. R.; Mashe, Tapfumanei; Khan, Ejaz Ahmed; Bulabula, Andre N. H.; Donado-Godoy, Pilar; Laxminarayan, Ramanan",
    "doi": "10.1016/S0140-6736(24)00862-6",
    "id": "Lewnard-2024-AMRBurden",
    "journal": "The Lancet",
    "number": "10442",
    "pages": "2439-2454",
    "publisher": "Elsevier BV",
    "title": "Burden of bacterial antimicrobial resistance in low-income and middle-income countries avertible by existing interventions: an evidence review and modelling analysis",
    "type": "article",
    "url": "https://doi.org/10.1016/S0140-6736(24)00862-6",
    "volume": "403",
    "year": 2024
  },
  {
    "abstract": "The study compares livestock antimicrobial resistance (AMR) data collection efforts globally with other variables relevant to AMR in humans and livestock. It identifies critical data gaps and shared priorities to strengthen AMR surveillance frameworks.",
    "author": "Venkateswaran, Narmada; Swetschinski, Lucien R.; Fastl, Christina; Di Bari, Carlotta; Criscuolo, Nicola G.; Mulchandani, Ranya; Zhao, Cheng; Meštrović, Tomislav; Ikuta, Kevin S.; Martins, Sara Babo; Coyne, Lucy A.; Afonso, João Sucena; Huntington, Ben; Rushton, Jonathan; Devleesschauwer, Brecht; Sartorius, Benn; Van Boeckel, Thomas P.; Pigott, David M.",
    "doi": "10.1186/s12879-024-09847-3",
    "id": "Venkateswaran-2024-AMRDataGaps",
    "journal": "BMC Infectious Diseases",
    "number": "1",
    "pages": "Article 1027",
    "publisher": "Springer Science and Business Media LLC",
    "title": "Using Priorities Between Human and Livestock Bacterial Antimicrobial Resistance to Identify Data Gaps in Livestock AMR Surveillance",
    "type": "article",
    "url": "https://doi.org/10.1186/s12879-024-09847-3",
    "volume": "24",
    "year": 2024
  },
  {
    "abstract": "This study uses geospatial modeling to map antimicrobial resistance (AMR) prevalence across low- and middle-income countries (LMICs). It highlights hotspots of AMR in regions like China, India, Brazil, and Africa, providing maps for targeted AMR surveillance and interventions.",
    "author": "Zhao, Cheng; Wang, Yu; Mulchandani, Ranya; Van Boeckel, Thomas P.",
    "doi": "10.1038/s41467-024-45111-7",
    "id": "Zhao-2024-AMRMaps",
    "journal": "Nature Communications",
    "number": "1",
    "pages": "Article 763",
    "publisher": "Springer Science and Business Media LLC",
    "title": "Global Surveillance of Antimicrobial Resistance in Food Animals Using Priority Drugs Maps",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-024-45111-7",
    "volume": "15",
    "year": 2024
  },
  {
    "abstract": "This paper reflects on the challenges faced by modeling groups during the COVID-19 pandemic and advocates for sustained structural support to enhance infectious disease modeling efficiency for future public health crises.",
    "author": "Le Rutte, Epke A.; Shattock, Andrew J.; Zhao, Cheng; Jagadesh, Soushieta; Balać, Miloš; Müller, Sebastian A.; Nagel, Kai; Erath, Alexander L.; Axhausen, Kay W.; Van Boeckel, Thomas P.; Penny, Melissa A.",
    "doi": "10.1016/j.epidem.2023.100734",
    "id": "LeRutte-2024-InfectiousModeling",
    "journal": "Epidemics",
    "number": "C",
    "pages": "100734",
    "publisher": "Elsevier BV",
    "title": "A Case for Ongoing Structural Support to Maximise Infectious Disease Modelling Efficiency for Future Public Health Emergencies: A Modelling Perspective",
    "type": "article",
    "url": "https://doi.org/10.1016/j.epidem.2023.100734",
    "volume": "46",
    "year": 2024
  },
  {
    "abstract": "This article examines the global extent of antimicrobial resistance and emphasizes the necessity for robust data to inform policies and mitigate the challenge effectively.",
    "author": "Okeke, Iruka N.; de Kraker, Marlieke E. A.; Van Boeckel, Thomas P.; Kumar, Chirag K.; Schmitt, Heike; Gales, Ana C.; Bertagnolio, Silvia; Sharland, Mike; Laxminarayan, Ramanan",
    "doi": "10.1016/S0140-6736(24)00876-6",
    "id": "Okeke-2024-AMRChallenge",
    "journal": "The Lancet",
    "number": "10442",
    "pages": "2426-2438",
    "publisher": "Elsevier BV",
    "title": "The Scope of the Antimicrobial Resistance Challenge",
    "type": "article",
    "url": "https://doi.org/10.1016/S0140-6736(24)00876-6",
    "volume": "403",
    "year": 2024
  },
  {
    "abstract": "This paper examines the convergence rates of stochastic natural gradient variational inference (NGVI) in probabilistic models. For models with conjugate likelihoods, the authors establish a non-asymptotic convergence rate of \\( \\mathcal{O}\\left(\\frac{1}{T}\\right) \\), comparable to that of stochastic gradient descent. In the context of non-conjugate likelihoods, they demonstrate that stochastic NGVI with canonical parameterization optimizes a non-convex objective, suggesting that achieving a global convergence rate of \\( \\mathcal{O}\\left(\\frac{1}{T}\\right) \\) would require significant advancements in understanding the optimization of the evidence lower bound using natural gradients.",
    "author": "Wu, Kaiwen; Gardner, Jacob R.",
    "doi": "10.2406.01870",
    "id": "Wu-2024-NGVI",
    "journal": "Proceedings of the 41st International Conference on Machine Learning (ICML)",
    "keywords": "Stochastic Natural Gradient Variational Inference; Non-Asymptotic Convergence; Conjugate Likelihoods; Non-Conjugate Likelihoods; Evidence Lower Bound Optimization",
    "number": "",
    "pages": "",
    "publisher": "PMLR",
    "title": "Understanding Stochastic Natural Gradient Variational Inference",
    "type": "conference",
    "url": "https://arxiv.org/abs/2406.01870",
    "volume": "",
    "year": 2024
  },
  {
    "abstract": "This study demonstrates a clinically compliant cryopreservation protocol for differentiated retinal pigment epithelial cells derived from stem cells. It validates their safety, functionality, and feasibility for transplantation in treating age-related macular degeneration.",
    "author": "Baqué-Vidal, Laura; Main, Heather; Petrus-Reurer, Sandra; Lederer, Alex R.; Beri, Nefeli-Eirini; Bär, Frederik; Metzger, Hugo; Zhao, Cheng; Efstathopoulos, Paschalis; Saietz, Sarah; Wrona, Andreas; Jaberi, Elham; Willenbrock, Hanni; Reilly, Hazel; Hedenskog, Mona; Moussaud-Lamodière, Elisabeth; Kvanta, Anders; Villaescusa, J. Carlos; La Manno, Gioele; Lanner, Fredrik",
    "doi": "10.1016/j.jcyt.2024.01.014",
    "id": "BaqueVidal-2024-RPECryopreservation",
    "journal": "Cytotherapy",
    "number": "4",
    "pages": "340-350",
    "publisher": "Elsevier BV",
    "title": "Clinically compliant cryopreservation of differentiated retinal pigment epithelial cells",
    "type": "article",
    "url": "https://doi.org/10.1016/j.jcyt.2024.01.014",
    "volume": "26",
    "year": 2024
  },
  {
    "abstract": "Across biological systems, cells undergo coordinated changes in gene expression, resulting in transcriptome dynamics that unfold within a low-dimensional manifold. While low-dimensional dynamics can be extracted using RNA velocity, these algorithms can be fragile and rely on heuristics lacking statistical control. Moreover, the estimated vector field is not dynamically consistent with the traversed gene expression manifold. To address these challenges, we introduce a Bayesian model of RNA velocity that couples velocity field and manifold estimation in a reformulated, unified framework, identifying the parameters of an explicit dynamical system. Focusing on the cell cycle, we implement VeloCycle to study gene regulation dynamics on one-dimensional periodic manifolds and validate its ability to infer cell cycle periods using live imaging. We also apply VeloCycle to reveal speed differences in regionally defined progenitors and Perturb-seq gene knockdowns. Overall, VeloCycle expands the single-cell RNA sequencing analysis toolkit with a modular and statistically consistent RNA velocity inference framework.",
    "author": "Lederer, Alex R.; Leonardi, Maxine; Talamanca, Lorenzo; Bobrovskiy, Daniil M.; Herrera, Antonio; Droin, Colas; Khven, Irina; Carvalho, Hugo J. F.; Valente, Alessandro; Dominguez Mantes, Albert; Mulet Arabí, Pau; Pinello, Luca; Naef, Felix; La Manno, Gioele",
    "doi": "10.1038/s41592-024-02471-8",
    "id": "Lederer-2024-RNAVelocity",
    "journal": "Nature Methods",
    "keywords": "RNA velocity; cell cycle; Bayesian model",
    "note": "Received 18 December 2023, Accepted 15 September 2024, Published 31 October 2024, Issue Date December 2024",
    "pages": "2271–2286",
    "publisher": "Springer Science and Business Media LLC",
    "title": "Statistical inference with a manifold-constrained RNA velocity model uncovers cell cycle speed modulations",
    "type": "article",
    "url": "https://doi.org/10.1038/s41592-024-02471-8",
    "volume": "21",
    "year": 2024
  },
  {
    "abstract": "Introduces PepNet, a neural network combining pre-trained protein language models for predicting anti-inflammatory and antimicrobial peptides. The model achieves high interpretability by mapping sequence features to biological functions, outperforming state-of-the-art methods in both prediction accuracy and model transparency.",
    "author": "Han, Jiyun; Kong, Tongxin; Liu, Juntao",
    "doi": "10.1038/s42003-024-06911-1",
    "id": "Han-2024-PepNet",
    "journal": "Communications Biology",
    "keywords": "PepNet, antimicrobial peptides, anti-inflammatory peptides, protein language models, deep learning, interpretability",
    "number": "1",
    "title": "PepNet: An interpretable neural network for anti-inflammatory and antimicrobial peptides prediction using a pre-trained protein language model",
    "type": "article",
    "url": "https://doi.org/10.1038/s42003-024-06911-1",
    "volume": "7",
    "year": 2024
  },
  {
    "abstract": "Introduces AMP-Detector, a sequence-based classification model leveraging protein language models and machine learning algorithms to identify antimicrobial peptides. The model achieves significant accuracy and has led to the discovery of over 190,000 potential AMPs, with integration into generative design resulting in over 500 novel AMPs.",
    "author": "Medina-Ortiz, David; Contreras, Seba; Fernández, Diego; Soto-García, Nicole; Moya, Iván; Cabas-Mora, Gabriel; Olivera-Nappa, Álvaro",
    "doi": "10.3390/ijms25168851",
    "id": "Medina-Ortiz-2024-AMPDetector",
    "journal": "International Journal of Molecular Sciences",
    "keywords": "antimicrobial peptides, protein language models, machine learning, AMP prediction, generative design",
    "number": "16",
    "title": "Protein Language Models and Machine Learning Facilitate the Identification of Antimicrobial Peptides",
    "type": "article",
    "url": "http://dx.doi.org/10.3390/ijms25168851",
    "volume": "25",
    "year": 2024
  },
  {
    "abstract": "The study introduces KT-AMP, a model that utilizes fine-tuning and transfer learning based on pre-trained protein language models to predict antimicrobial peptides and their specific functions. Experimental results demonstrate that KT-AMP outperforms current state-of-the-art models on benchmark datasets, highlighting the effectiveness of fine-tuning protein language models for downstream peptide prediction tasks.",
    "author": "Liang, Xiao; Zhao, Haochen; Wang, Jianxin",
    "doi": "10.1007/978-981-97-5131-0_6",
    "id": "Liang-2024-KTAMP",
    "journal": "Lecture Notes in Computer Science (LNCS), Springer",
    "keywords": "KT-AMP, protein language models, antimicrobial peptides, transfer learning, fine-tuning, peptide prediction",
    "title": "KT-AMP: Enhancing Antimicrobial Peptide Functions Prediction Through Knowledge Transfer on Protein Language Model",
    "type": "conference",
    "url": "https://doi.org/10.1007/978-981-97-5131-0_6",
    "year": 2024
  },
  {
    "abstract": "Proposes a deep learning approach, PGAT-ABPp, combining ProtT5 embeddings and graph attention networks (GAT) for antibacterial peptide identification. PGAT-ABPp demonstrates superior performance over 14 state-of-the-art models, achieving significant improvements in accuracy, F1-score, and MCC. The interpretability analysis underscores its capability to identify key residues in known active antibacterial peptides.",
    "author": "Hao, Yuelei; Liu, Xuyang; Fu, Haohao; Shao, Xueguang; Cai, Wensheng",
    "doi": "10.1093/bioinformatics/btae497",
    "id": "Hao-2024-PGATABPp",
    "journal": "Bioinformatics",
    "keywords": "protein language models, antibacterial peptides, graph attention networks, ProtT5, PGAT-ABPp, deep learning",
    "number": "8",
    "title": "PGAT-ABPp: Harnessing Protein Language Models and Graph Attention Networks for Antibacterial Peptide Identification with Remarkable Accuracy",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btae497",
    "volume": "40",
    "year": 2024
  },
  {
    "abstract": "This review discusses various AI tools for identifying and annotating antibiotic resistance genes (ARGs), exploring methods such as direct classification from genome and plasmid sequences, and feature selection. It highlights the advantages and limitations of AI approaches in augmenting traditional ARG detection methods.",
    "author": "Olatunji, Isaac; Bardaji, Danae Kala Rodriguez; Miranda, Renata Rezende; Savka, Michael A.; Hudson, André O.",
    "doi": "10.3389/fmicb.2024.1437602",
    "id": "Olatunji-2024-AIARGTools",
    "journal": "Frontiers in Microbiology",
    "keywords": "antibiotic resistance genes, artificial intelligence, ARG identification, AI tools, feature selection",
    "title": "Artificial Intelligence Tools for the Identification of Antibiotic Resistance Genes",
    "type": "article",
    "url": "https://doi.org/10.3389/fmicb.2024.1437602",
    "volume": "15",
    "year": 2024
  },
  {
    "abstract": "This study presents a novel approach for predicting antibiotic resistance gene (ARG) resistance mechanisms using ProteinBERT, a deep learning-based protein language model. The method outperforms existing techniques on diverse ARG datasets, including those with low similarity to known sequences, and offers enhanced interpretability by identifying biologically relevant features such as conserved amino acid residues and antibiotic target binding sites.",
    "author": "Yagimoto, Kanami; Hosoda, Shion; Sato, Miwa; Hamada, Michiaki",
    "doi": "10.1093/bioinformatics/btae550",
    "id": "Yagimoto-2024-ARG-BERT",
    "journal": "Bioinformatics",
    "keywords": "Protein-BERT, antimicrobial resistance, ARG prediction, resistance mechanisms, protein language models",
    "title": "Prediction of Antibiotic Resistance Mechanisms Using a Protein Language Model",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btae550",
    "year": 2024
  },
  {
    "abstract": "The paper presents ProstT5, a bilingual model capable of translating between 3D protein structures and 1D sequences. It utilizes the 3Di-alphabet and ProtT5 embeddings, improving performance on structure-related tasks with significant speedups. This is critical for searching metagenomic sequence databases with structural comparisons.",
    "author": "Heinzinger, Michael; Weissenow, Konstantin; Gomez Sanchez, Joaquin; Henkel, Adrian; Mirdita, Milot; Steinegger, Martin; Rost, Burkhard",
    "doi": "10.1093/nargab/lqae150",
    "id": "Heinzinger-2024-ProtT5",
    "journal": "NAR Genomics and Bioinformatics",
    "keywords": "ProstT5, protein language models, bilingual model, 3Di-alphabet, structural predictions, metagenomic databases",
    "number": "4",
    "title": "ProstT5: Bilingual Language Model for Protein Sequence and Structure",
    "type": "article",
    "url": "https://doi.org/10.1093/nargab/lqae150",
    "volume": "6",
    "year": 2024
  },
  {
    "abstract": "This study uses low-dimensional positional embeddings from protein language models (e.g., ESM-2) for speed-optimized local search algorithms, enhancing sensitivity in detecting distant evolutionary relationships without compromising search speed.",
    "author": "Johnson, Sean R.; Peshwa, Meghana; Sun, Zhiyi",
    "doi": "10.7554/eLife.91415",
    "id": "Johnson-2024-HomologySearch",
    "journal": "eLife",
    "keywords": "protein language models, antimicrobial resistance, homology detection, ESM-2, Foldseek, bioinformatics",
    "title": "Sensitive Remote Homology Search by Local Alignment of Small Positional Embeddings from Protein Language Models",
    "type": "article",
    "url": "https://doi.org/10.7554/eLife.91415",
    "volume": "12",
    "year": 2024
  },
  {
    "abstract": "Directed protein evolution is central to biomedical applications but faces challenges like experimental complexity, inefficient multi-property optimization, and local maxima traps. While in silico methods using protein language models (PLMs) can provide modeled fitness landscape guidance, they struggle to generalize across diverse protein families and map to protein activity. We present EVOLVEpro, a few-shot active learning framework that combines PLMs and regression models to rapidly improve protein activity. EVOLVEpro surpasses current methods, yielding up to 100-fold improvements in desired properties. We demonstrate its effectiveness across six proteins in RNA production, genome editing, and antibody binding applications. These results highlight the advantages of few-shot active learning with minimal experimental data over zero-shot predictions. EVOLVEpro opens new possibilities for AI-guided protein engineering in biology and medicine.",
    "author": "Jiang, Kaiyi; Yan, Zhaoqing; Di Bernardo, Matteo; Sgrizzi, Samantha R.; Villiger, Lukas; Kayabolen, Alisan; Kim, B.J.; Carscadden, Josephine K.; Hiraizumi, Masahiro; Nishimasu, Hiroshi; Gootenberg, Jonathan S.; Abudayyeh, Omar O.",
    "doi": "10.1126/science.adr6006",
    "id": "Jiang-2024-EVOLVEpro",
    "journal": "Science",
    "keywords": "EVOLVEpro, Protein language models, Directed evolution, Active learning, In silico protein engineering",
    "title": "Rapid in silico directed evolution by a protein language model with EVOLVEpro",
    "type": "article",
    "url": "https://doi.org/10.1126/science.adr6006",
    "year": 2024
  },
  {
    "abstract": "This study addresses the increasing demand for precise, effective, and automated protein sequence classification methods by employing natural language processing (NLP) techniques on a dataset comprising 75 target protein classes. The authors explored various machine learning and deep learning models, including K-Nearest Neighbors (KNN), Multinomial Naïve Bayes, Logistic Regression, Multi-Layer Perceptron (MLP), Decision Tree, Random Forest, XGBoost, Voting and Stacking classifiers, Convolutional Neural Network (CNN), Long Short-Term Memory (LSTM), and transformer models (BertForSequenceClassification, DistilBERT, and ProtBert). Experiments were conducted using amino acid ranges of 1-4 grams for machine learning models and different sequence lengths for CNN and LSTM models. ProtBert demonstrated the highest performance among transformer models, with an accuracy of 76.0% and an F1 score of 61.0%. Advanced NLP techniques, particularly ensemble methods and transformer models, show great potential in protein classification.",
    "author": "Perveen, Huma; Weeds, Julie",
    "doi": "10.48550/arXiv.2409.04491",
    "id": "Perveen-2024-ProteinClassification",
    "journal": "arXiv",
    "keywords": "Protein classification, NLP, ProtBERT, ensemble methods, deep learning",
    "title": "Protein Sequence Classification Using Natural Language Processing Techniques",
    "type": "article",
    "url": "https://arxiv.org/abs/2409.04491",
    "year": 2024
  },
  {
    "abstract": "This study introduces a protein language model for determining the complete sequence of a peptide based on the measurement of a limited set of amino acids. Traditional protein sequencing techniques, such as mass spectrometry and Edman degradation, face limitations in accurately identifying all amino acids, hindering comprehensive proteome analysis. The proposed method simulates partial sequencing data by selectively masking amino acids that are experimentally difficult to identify in protein sequences from the UniRef database. A ProtBERT-derived transformer-based model is then fine-tuned to predict these masked residues, providing an approximation of the complete sequence. Evaluations on three bacterial Escherichia species achieved per-amino-acid accuracy up to 90.5% when only four amino acids (K, C, Y, M) are known. Structural assessments using AlphaFold and TM-score validate the biological relevance of the predictions. The model also demonstrates potential for evolutionary analysis through cross-species performance. This integration of simulated experimental constraints with computational predictions offers a promising avenue for enhancing protein sequence analysis, potentially accelerating advancements in proteomics and structural biology.",
    "author": "Pham, Thuong Le Hoai; Saurav, Jillur Rahman; Omere, Aisosa A.; Heyl, Calvin J.; Nasr, Mohammad Sadegh; Reynolds, Cody Tyler; Veerla, Jai Prakash Yadav; Shang, Helen H.; Jaworski, Justyn; Ravenscraft, Alison; Buonomo, Joseph Anthony; Luber, Jacob M.",
    "doi": "10.48550/arXiv.2408.00892",
    "id": "Pham-2024-PeptideSequencing",
    "journal": "arXiv",
    "keywords": "Peptide sequencing, protein language models, ProtBERT, proteomics, structural biology",
    "title": "Peptide Sequencing Via Protein Language Models",
    "type": "article",
    "url": "https://arxiv.org/abs/2408.00892",
    "year": 2024
  },
  {
    "abstract": "AMPLIFY introduces a best-in-class protein language model that is orders of magnitude less expensive to train and deploy than previous models, focusing on data quality over scale.",
    "author": "Fournier, Quentin; Vernon, Robert M.; van der Sloot, Almer; Schulz, Benjamin; Chandar, Sarath; Langmead, Christopher James",
    "id": "Fournier-2024-AMPLIFY",
    "journal": "bioRxiv",
    "keywords": "protein language models, AMPLIFY, data quality, cost-efficient training",
    "note": "bioRxiv preprint",
    "title": "Protein Language Models: Is Scaling Necessary?",
    "type": "article",
    "url": "https://www.biorxiv.org/content/10.1101/2024.09.23.614603v1",
    "year": 2024
  },
  {
    "abstract": "DPLM introduces a diffusion-based approach to protein language modeling, enabling the generation of novel protein sequences and enhancing predictive tasks through fine-tuning.",
    "author": "Wang, Xinyou; Zheng, Zaixiang; Ye, Fei; Xue, Dongyu; Huang, Shujian; Gu, Quanquan",
    "id": "Wang-2024-DPLM",
    "journal": "Proceedings of the 41st International Conference on Machine Learning (ICML)",
    "keywords": "protein language models, DPLM, diffusion models, ICML, protein sequence generation",
    "title": "Diffusion Language Models Are Versatile Protein Learners",
    "type": "article",
    "url": "https://arxiv.org/abs/2402.18567",
    "year": 2024
  },
  {
    "abstract": "xTrimoPGLM introduces a unified protein language model capable of addressing both protein understanding and generation tasks through an innovative pre-training framework. With 100 billion parameters trained on 1 trillion tokens, it has achieved superior performance across various protein understanding benchmarks.",
    "author": "Chen, Bo; Cheng, Xingyi; Li, Pan; Geng, Yangli-ao; Gong, Jing; Li, Shen; Bei, Zhilei; Tan, Xu; Wang, Boyan; Zeng, Xin; Liu, Chiming; Zeng, Aohan; Dong, Yuxiao; Tang, Jie; Song, Le",
    "id": "Chen-2024-xTrimoPGLM",
    "journal": "arXiv",
    "keywords": "protein language models, xTrimoPGLM, pre-trained transformer, protein understanding",
    "note": "arXiv preprint",
    "title": "xTrimoPGLM: Unified 100B-Scale Pre-trained Transformer for Deciphering the Language of Protein",
    "type": "article",
    "url": "https://arxiv.org/abs/2401.06199",
    "year": 2024
  },
  {
    "abstract": "ProteinCLIP introduces a contrastive learning approach that aligns protein amino acid sequences with textual descriptions of their functions. This alignment refines sequence embeddings to focus on functional aspects, enhancing performance in predicting protein-protein interactions and identifying homologous proteins with low sequence similarity.",
    "author": "Wu, Kevin E.; Chang, Howard; Zou, James",
    "id": "Wu-2024-ProteinCLIP",
    "journal": "bioRxiv",
    "keywords": "protein language models, ProteinCLIP, contrastive learning, function-centric embeddings",
    "note": "bioRxiv preprint",
    "title": "ProteinCLIP: Enhancing Protein Language Models with Natural Language",
    "type": "article",
    "url": "https://www.biorxiv.org/content/10.1101/2024.05.14.594226v1",
    "year": 2024
  },
  {
    "abstract": "ProLLaMA introduces a training framework that transforms general large language models into protein-specific models capable of handling multiple protein language processing tasks. It employs low-rank adaptation and a two-stage training approach, emphasizing universality, low overhead, and scalability.",
    "author": "Lv, Liuzhenghao; Lin, Zongying; Li, Hao; Liu, Yuyang; Cui, Jiaxi; Chen, Calvin Yu-Chian; Yuan, Li; Tian, Yonghong",
    "id": "Lv-2024-ProLLaMA",
    "journal": "arXiv",
    "keywords": "protein language models, ProLLaMA, multi-task protein processing",
    "note": "arXiv preprint",
    "title": "ProLLaMA: A Protein Large Language Model for Multi-Task Protein Language Processing",
    "type": "article",
    "url": "https://arxiv.org/abs/2402.16445",
    "year": 2024
  },
  {
    "abstract": "This study introduces atom-in-SMILES, a new tokenization method that addresses the limitations of standard SMILES tokenization by eliminating token ambiguity. It improves prediction quality in molecular translation and property prediction tasks, showing reduced token degeneration and enhanced generation accuracy for AI-driven chemical models.",
    "author": "Ucak, Umit V.; Ashyrmamatov, Islambek; Lee, Juyong",
    "doi": "10.1186/s13321-023-00725-9",
    "id": "Ucak-2023-AiS",
    "journal": "Journal of Cheminformatics",
    "pages": "Article 55",
    "title": "Improving the quality of chemical language model outcomes with atom-in-SMILES tokenization",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00725-9",
    "volume": "15",
    "year": 2023
  },
  {
    "abstract": "This perspective explores how biomolecular NMR complements deep learning-based structural predictors and how AI directly enhances NMR acquisition, analysis, and experiment design. Together, AI and NMR promise new insights into protein dynamics and conformational variability, advancing structural biology and drug discovery.",
    "author": "Shukla, Vaibhav Kumar; Heller, Gabriella T.; Hansen, D. Flemming",
    "doi": "10.1016/j.str.2023.09.011",
    "id": "Shukla-2023-NMRAI",
    "journal": "Structure",
    "number": "11",
    "pages": "1360–1374",
    "title": "Biomolecular NMR spectroscopy in the era of artificial intelligence",
    "type": "article",
    "url": "https://doi.org/10.1016/j.str.2023.09.011",
    "volume": "31",
    "year": 2023
  },
  {
    "abstract": "This review highlights the use of machine learning in computational NMR prediction and structure elucidation. It addresses the limitations of quantum mechanics methods for large or flexible molecules and surveys advances in ML-based approaches for simulating NMR data and correlating experimental results with structural candidates. The authors outline essential contributions and propose future directions for the field.",
    "author": "Cortés, Iván; Cuadrado, Cristina; Hernández Daranas, Antonio; Sarotti, Ariel M.",
    "doi": "10.3389/fntpr.2023.1122426",
    "id": "Cortes-2023-NMRMLReview",
    "journal": "Frontiers in Natural Products",
    "pages": "1122426",
    "title": "Machine learning in computational NMR-aided structural elucidation",
    "type": "article",
    "url": "https://doi.org/10.3389/fntpr.2023.1122426",
    "volume": "2",
    "year": 2023
  },
  {
    "abstract": "Structure elucidation of unknown compounds based on nuclear magnetic resonance (NMR) remains a challenging problem in both synthetic organic and natural product chemistry. Library matching has been an efficient method to assist structure elucidation. However, it is limited by the coverage of libraries. In addition, prior knowledge such as molecular fragments is neglected. To solve the problem, we propose a conditional molecular generation net (CMGNet) to allow input of multiple sources of information. CMGNet not only uses 13C NMR spectrum data as input but molecular formulas and fragments of molecules are also employed as input conditions. Our model applies large-scale pretraining for molecular understanding and fine-tuning on two NMR spectral data sets of different granularity levels to accommodate structure elucidation tasks. CMGNet generates structures based on 13C NMR data, molecular formula, and fragment information, with a recovery rate of 94.17% in the top 10 recommendations. In addition, the generative model performed well in the generation of various classes of compounds and in the structural revision task. CMGNet has a deep understanding of molecular connectivities from 13C NMR, molecular formula, and fragments, paving the way for a new paradigm of deep learning-assisted inverse problem-solving.",
    "author": "Yao, Lin; Yang, Minjian; Song, Jianfei; Yang, Zhuo; Sun, Hanyu; Shi, Hui; Liu, Xue; Ji, Xiangyang; Deng, Yafeng; Wang, Xiaojian",
    "doi": "10.1021/acs.analchem.2c05817",
    "id": "Yao-2023-CMGNet",
    "journal": "Analytical Chemistry",
    "number": "12",
    "pages": "5393–5401",
    "title": "Conditional molecular generation net enables automated structure elucidation based on ¹³C NMR spectra and prior knowledge",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.2c05817",
    "volume": "95",
    "year": 2023
  },
  {
    "abstract": "Variational autoencoders (VAEs) defined over SMILES string and graph-based representations of molecules promise to improve the optimization of molecular properties, thereby revolutionizing the pharmaceuticals and materials industries. However, these VAEs are hindered by the non-unique nature of SMILES strings and the computational cost of graph convolutions. To efficiently pass messages along all paths through the molecular graph, we encode multiple SMILES strings of a single molecule using a set of stacked recurrent neural networks, harmonizing hidden representations of each atom between SMILES representations, and use attentional pooling to build a final fixed-length latent representation. By then decoding to a disjoint set of SMILES strings of the molecule, our All SMILES VAE learns an almost bijective mapping between molecules and latent representations near the high probability mass subspace of the prior. Our SMILES-derived but molecule-based latent representations significantly surpass the state of the art in a variety of fully and semi-supervised property regression and molecular property optimization tasks.",
    "author": "Alperstein, Zaccary; Cherkasov, Artem; Rolfe, Jason T.",
    "booktitle": "QSPR/QSAR analysis using SMILES and quasi-SMILES",
    "doi": "10.1007/978-3-031-28401-4_4",
    "id": "Alperstein-2023-AllSMILES",
    "note": "Chapter in \"Challenges and Advances in Computational Chemistry and Physics\", vol. 33",
    "pages": "85–115",
    "publisher": "Springer, Cham",
    "title": "All SMILES variational autoencoder for molecular property prediction and optimization",
    "type": "incollection",
    "url": "https://doi.org/10.1007/978-3-031-28401-4_4",
    "year": 2023
  },
  {
    "abstract": "This paper develops distributionally robust stochastic optimization (DRSO) using Wasserstein distance. It provides strong duality results, explicit construction of worst-case distributions, and tractable reformulations. Key applications include infinite-dimensional process control and point process intensity estimation.",
    "author": "Gao, Rui; Kleywegt, Alexander",
    "doi": "10.1287/moor.2022.1275",
    "id": "Gao-2023-WDRO",
    "journal": "Mathematics of Operations Research",
    "pages": "603–655",
    "title": "Distributionally robust stochastic optimization with Wasserstein distance",
    "type": "article",
    "url": "https://doi.org/10.1287/moor.2022.1275",
    "volume": "48",
    "year": 2023
  },
  {
    "abstract": "UniMAP is a multi-modal molecular representation model that integrates SMILES and graph modalities through deep cross-modality fusion using a Transformer architecture. It introduces four pre-training tasks—cross-modality masking, SMILES-graph matching, fragment-level alignment, and domain knowledge learning—to align both global and local representations. UniMAP outperforms prior models on tasks such as molecular property prediction, drug-target affinity, and drug-drug interaction.",
    "author": "Feng, Shikun; Yang, Lixin; Huang, Yanwen; Ni, Yuyan; Ma, Weiying; Lan, Yanyan",
    "doi": "10.48550/arXiv.2310.14216",
    "id": "Feng-2023-UniMAP",
    "journal": "arXiv",
    "keywords": "UniMAP, molecular representation, SMILES, molecular graphs, contrastive learning, cross-modality learning",
    "title": "UniMAP: Universal SMILES-graph representation learning",
    "type": "article",
    "url": "https://arxiv.org/abs/2310.14216",
    "year": 2023
  },
  {
    "abstract": "Mole-BERT addresses limitations in molecular GNN pre-training by introducing a context-aware tokenizer to produce richer atom vocabularies and proposing two strategies: Masked Atoms Modeling (MAM) and Triplet Masked Contrastive Learning (TMCL). MAM mitigates negative transfer by using discrete atom codes for masking and prediction, while TMCL enhances graph-level representations for retrieval tasks. Together, these form a new pre-training framework that outperforms state-of-the-art methods.",
    "author": "Xia, Jun; Zhao, Chengshuai; Hu, Bozhen; Gao, Zhangyang; Tan, Cheng; Liu, Yue; Li, Siyuan; Li, Stan Z.",
    "booktitle": "ICLR 2023",
    "doi": "10.26434/chemrxiv-2023-dngg4",
    "id": "Xia-2023-MoleBERT",
    "keywords": "Mole-BERT, graph neural networks, molecular pre-training, contrastive learning, VQ-VAE, masked modeling",
    "title": "Mole-BERT: Rethinking pre-training graph neural networks for molecules",
    "type": "inproceedings",
    "url": "https://openreview.net/forum?id=jevY-DtiZTR",
    "year": 2023
  },
  {
    "abstract": "Convolutional neural networks are used to classify 37 functional groups from 50,936 infrared spectra of 30,611 molecules, enabling automatic identification without rule-based or peak-matching methods.",
    "author": "Jung, Guwon; Jung, Son Gyo; Cole, Jacqueline M.",
    "doi": "10.1039/d2sc05892h",
    "id": "Jung-2023-InfraredCNN",
    "journal": "Chemical Science",
    "keywords": "infrared spectroscopy, convolutional neural networks, functional group classification, materials characterization",
    "number": "13",
    "pages": "3600–3609",
    "title": "Automatic materials characterization from infrared spectra using convolutional neural networks",
    "type": "article",
    "url": "https://doi.org/10.1039/d2sc05892h",
    "volume": "14",
    "year": 2023
  },
  {
    "abstract": "This post explores how models like GPT-2 and GPT-3 tokenize integers inconsistently—sometimes as whole tokens, sometimes as fragments—leading to brittle and inefficient arithmetic reasoning.",
    "author": "Millidge, Beren",
    "id": "Millidge-2023-IntTokInsane",
    "journal": "beren.io",
    "keywords": "tokenization, integers, GPT-2, GPT-3, arithmetic, language models",
    "title": "Integer tokenization is insane",
    "type": "article",
    "url": "https://www.beren.io/2023-02-04-Integer-tokenization-is-insane/",
    "year": 2023
  },
  {
    "abstract": "Gisting trains LMs to compress prompts into reusable 'gist' tokens using modified attention masks during instruction fine-tuning. It reduces prompt length by up to 26×, cuts FLOPs by 40%, and lowers latency with minimal quality loss, all without retraining.",
    "author": "Mu, Jesse; Li, Xiang Lisa; Goodman, Noah",
    "doi": "10.48550/arXiv.2304.08467",
    "id": "Mu-2023-Gisting",
    "journal": "NeurIPS 2023 (arXiv preprint)",
    "keywords": "prompt compression, gisting, language models, efficiency, LLaMA, FLAN-T5",
    "title": "Learning to compress prompts with gist tokens",
    "type": "article",
    "url": "https://arxiv.org/abs/2304.08467",
    "year": 2023
  },
  {
    "abstract": "Megabyte is a multiscale transformer model for sequences exceeding one million bytes. It uses patch-wise local and global attention to scale autoregressive modeling efficiently. It achieves strong results on long-context text, ImageNet, and raw audio.",
    "author": "Yu, Lili; Simig, Daniel; Flaherty, Colin; Aghajanyan, Armen; Zettlemoyer, Luke; Lewis, Mike",
    "doi": "10.48550/arXiv.2305.07185",
    "id": "Yu-2023-Megabyte",
    "journal": "NeurIPS 2023 (poster)",
    "keywords": "transformer, multiscale model, byte-level, long-context, autoregressive",
    "note": "Presented at NeurIPS 2023 (poster)",
    "title": "MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers",
    "type": "article",
    "url": "https://openreview.net/forum?id=JTmO2V9Xpz",
    "year": 2023
  },
  {
    "abstract": "Accurately estimating the true richness of a target community is still a statistical challenge, particularly in highly diverse communities. Due to sampling limitations or limited resources, undetected species are present in many surveys and observed richness is an underestimate of true richness. In the literature, methods for estimating the undetected richness of a sample are generally divided into two categories: parametric and nonparametric estimators. Imposing no assumptions on species detection rates, nonparametric methods demonstrate robust statistical performance and are widely used in ecological studies. However, nonparametric estimators may seriously underestimate richness when species composition has a high degree of heterogeneity. Parametric approaches, which reduce the number of parameters by assuming that species-specific detection probabilities follow a given statistical distribution, use traditional statistical inference to calculate species richness estimates. When species detection rates meet the model assumption, the parametric approach could supply a nearly unbiased estimator. However, the infeasibility and inefficiency of solving maximum likelihood functions limit the application of parametric methods in ecological studies when the model assumption is violated, or the collected data is sparse.",
    "author": "Chiu, Chun-Huo",
    "doi": "10.7717/peerj.14540",
    "id": "Chiu-2023-GammaPoisson",
    "journal": "PeerJ",
    "keywords": "species richness, Gamma–Poisson, parametric estimation, biodiversity",
    "pages": "e14540",
    "title": "A more reliable species richness estimator based on the Gamma–Poisson model",
    "type": "article",
    "url": "https://peerj.com/articles/14540/",
    "year": 2023
  },
  {
    "abstract": "This study investigates neural-scaling behaviour in deep chemical models by varying model and dataset sizes over multiple orders of magnitude. It examines large language models for generative chemistry and graph neural networks for interatomic potentials, providing empirical neural-scaling laws and insights into the interplay between physical priors and model scaling.",
    "author": "Frey, Nathan C.; Soklaski, Ryan; Axelrod, Simon; Samsi, Siddharth; Gómez-Bombarelli, Rafael; Coley, Connor W.; Gadepally, Vijay",
    "doi": "10.1038/s42256-023-00740-3",
    "id": "Frey-2023-NeuralScaling",
    "journal": "Nature Machine Intelligence",
    "keywords": "Neural Scaling, Deep Learning, Chemical Models, Graph Neural Networks, ChemGPT, Machine Learning",
    "number": "11",
    "pages": "1297–1305",
    "title": "Neural scaling of deep chemical models",
    "type": "article",
    "url": "https://doi.org/10.1038/s42256-023-00740-3",
    "volume": "5",
    "year": 2023
  },
  {
    "abstract": "Traditional computational approaches to design chemical species are limited by the need to compute properties for a vast number of candidates, e.g., by discriminative modeling. Therefore, inverse design methods aim to start from the desired property and optimize a corresponding chemical structure. From a machine learning viewpoint, the inverse design problem can be addressed through so-called generative modeling. Mathematically, discriminative models are defined by learning the probability distribution function of properties given the molecular or material structure. In contrast, a generative model seeks to exploit the joint probability of a chemical species with target characteristics. The overarching idea of generative modeling is to implement a system that produces novel compounds that are expected to have a desired set of chemical features, effectively sidestepping issues found in the forward design process. In this contribution, we overview and critically analyze popular generative algorithms like generative adversarial networks, variational autoencoders, flow, and diffusion models. We highlight key differences between each of the models, provide insights into recent success stories, and discuss outstanding challenges for realizing generative modeling discovered solutions in chemical applications.",
    "author": "Anstine, Dylan M.; Isayev, Olexandr",
    "doi": "10.1021/jacs.2c13467",
    "id": "Anstine-2023-GenerativeModels",
    "journal": "Journal of the American Chemical Society",
    "number": "16",
    "pages": "8736-8750",
    "title": "Generative models as an emerging paradigm in the chemical sciences",
    "type": "article",
    "url": "https://doi.org/10.1021/jacs.2c13467",
    "volume": "145",
    "year": 2023
  },
  {
    "abstract": "Accurate understanding of ultraviolet–visible (UV–vis) spectra is critical for the high-throughput synthesis of compounds for drug discovery. Experimentally determining UV–vis spectra can become expensive when dealing with a large quantity of novel compounds. This provides us an opportunity to drive computational advances in molecular property predictions using quantum mechanics and machine learning methods. In this work, we use both quantum mechanically (QM) predicted and experimentally measured UV–vis spectra as input to devise four different machine learning architectures, UVvis-SchNet, UVvis-DTNN, UVvis-Transformer, and UVvis-MPNN, and assess the performance of each method. We find that the UVvis-MPNN model outperforms the other models when using optimized 3D coordinates and QM predicted spectra as input features. This model has the highest performance for predicting UV–vis spectra with a training RMSE of 0.06 and validation RMSE of 0.08. Most importantly, our model can be used for the challenging task of predicting differences in the UV–vis spectral signatures of regioisomers.",
    "author": "McNaughton, Andrew D.; Joshi, Rajendra P.; Knutson, Carter R.; Anubhav, Fnu; Luebke, Kevin J.; Malerich, Jeremiah P.; Madrid, Peter B.; Kumar, Neeraj",
    "doi": "10.1021/acs.jcim.2c01662",
    "id": "McNaughton-2023-UVVisML",
    "journal": "Journal of Chemical Information and Modeling",
    "pages": "1462–1471",
    "title": "Machine Learning Models for Predicting Molecular UV–Vis Spectra with Quantum Mechanical Properties",
    "type": "article",
    "url": "https://pubs.acs.org/doi/10.1021/acs.jcim.2c01662",
    "volume": "63",
    "year": 2023
  },
  {
    "abstract": "The identification of molecular structure is essential for understanding chemical diversity and for developing drug leads from small molecules. Nevertheless, the structure elucidation of small molecules by Nuclear Magnetic Resonance (NMR) experiments is often a long and non-trivial process that relies on years of training. To achieve this process efficiently, several spectral databases have been established to retrieve reference NMR spectra. However, the number of reference NMR spectra available is limited and has mostly facilitated annotation of commercially available derivatives. Here, we introduce DeepSAT, a neural network-based structure annotation and scaffold prediction system that directly extracts the chemical features associated with molecular structures from their NMR spectra. Using only the ¹H-¹³C HSQC spectrum, DeepSAT identifies related known compounds and thus efficiently assists in the identification of molecular structures. DeepSAT is expected to accelerate chemical and biomedical research by accelerating the identification of molecular structures.",
    "author": "Kim, Hyun Woo; Zhang, Chen; Reher, Raphael; Wang, Mingxun; Alexander, Kelsey L.; Nothias, Louis-Félix; Han, Yoo Kyong; Shin, Hyeji; Lee, Ki Yong; Lee, Kyu Hyeong; Kim, Myeong Ji; Dorrestein, Pieter C.; Gerwick, William H.; Cottrell, Garrison W.",
    "doi": "10.1186/s13321-023-00738-4",
    "id": "Kim-2023-DeepSAT",
    "journal": "Journal of Cheminformatics",
    "number": "1",
    "pages": "71",
    "title": "DeepSAT: Learning Molecular Structures from Nuclear Magnetic Resonance Data",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00738-4",
    "volume": "15",
    "year": 2023
  },
  {
    "abstract": "The ever-increasing demand for novel materials with superior properties inspires retrofitting traditional research paradigms in the era of artificial intelligence and automation. An autonomous experimental platform (AEP) has emerged as an exciting research frontier that achieves full autonomy via integrating data-driven algorithms such as machine learning (ML) with experimental automation in the material development loop from synthesis, characterization, and analysis, to decision making. In this review, we started with a primer to describe how to develop data-driven algorithms for solving material problems. Then, we systematically summarized recent progress on automated material synthesis, ML-enabled data analysis, and decision-making. Finally, we discussed the challenges and opportunities in an endeavor to develop the next-generation AEP for ultimately realizing an autonomous or self-driving laboratory. This review will provide insights for researchers aiming to learn the frontier of ML in materials science and deploy AEP in their labs for accelerating material development.",
    "author": "Xie, Yunchao; Sattari, Kianoosh; Zhang, Chi; Lin, Jian",
    "doi": "10.1016/j.pmatsci.2022.101043",
    "id": "Xie-2023-AutonomousLabs",
    "journal": "Progress in Materials Science",
    "note": "Published in February 2023",
    "pages": "101043",
    "publisher": "Elsevier",
    "title": "Toward autonomous laboratories: Convergence of artificial intelligence and experimental automation",
    "type": "article",
    "url": "https://doi.org/10.1016/j.pmatsci.2022.101043",
    "volume": "132",
    "year": 2023
  },
  {
    "abstract": "The significance of Nuclear Magnetic Resonance (NMR) spectroscopy in organic synthesis cannot be overstated, as it plays a pivotal role in deducing chemical structures from experimental data. While machine learning has predominantly been employed for predictive purposes in the analysis of spectral data, our study introduces a novel application of a transformer-based model's attention weights to unravel the underlying 'language' that correlates spectral peaks with their corresponding atom in the chemical structures. This attention mapping technique proves beneficial for comprehending spectra, enabling accurate assignment of spectra to the respective molecules. Our approach consistently achieves correct assignment of H-NMR experimental spectra to the respective molecules in a reaction, with an accuracy exceeding 95%. Furthermore, it consistently associates peaks with the correct atoms in the molecule, achieving a remarkable peak-to-atom match rate of 71% for exact match and 89% of close shift matching (0.59ppm). This framework exemplifies the capability of harnessing the attention mechanism within transformer models to unveil the intricacies of spectroscopic data. Importantly, this approach can readily be extended to other types of spectra, showcasing its versatility and potential for broader applications in the field.",
    "author": "Schilter, Oliver; Alberts, Marvin; Zipoli, Federico; Vaucher, Alain C.; Schwaller, Philippe; Laino, Teodoro",
    "id": "Schilter-2023-AttentionNMR",
    "journal": "NeurIPS 2023 AI4Science Workshop",
    "note": "Presented at NeurIPS 2023 AI4Science Workshop. Published on 28 Oct 2023, last modified on 30 Nov 2023.",
    "title": "Unveiling the Secrets of ¹H-NMR Spectroscopy: A Novel Approach Utilizing Attention Mechanisms",
    "type": "conference",
    "url": "https://openreview.net/forum?id=TScjG5zoB0",
    "year": 2023
  },
  {
    "abstract": "The application of machine learning models in chemistry has made remarkable strides in recent years. Even though there is considerable interest in automating common procedure in analytical chemistry using machine learning, very few models have been adopted into everyday use. Among the analytical instruments available to chemists, Nuclear Magnetic Resonance (NMR) spectroscopy is one of the most important, offering insights into molecular structure unobtainable with other methods. However, most processing and analysis of NMR spectra is still performed manually, making the task tedious and time consuming especially for larger quantities of spectra. We present a transformer-based machine learning model capable of predicting the molecular structure directly from the NMR spectrum. Our model is pretrained on synthetic NMR spectra, achieving a top-1 accuracy of 67.0% when predicting the structure from both the 1H and 13C spectrum. Additionally, we train a model which, given a spectrum and a set of likely compounds, selects the one corresponding to the spectrum. This model achieves a top-1 accuracy of 96.0% when trained on 1H spectra.",
    "author": "Alberts, Marvin; Zipoli, Federico; Vaucher, Alain C.",
    "doi": "10.26434/chemrxiv-2023-8wxcz",
    "id": "Alberts-2023-TransformerNMR",
    "journal": "ChemRxiv",
    "note": "Preprint, published on ChemRxiv on 14 August 2023",
    "number": "",
    "title": "Learning the language of NMR: Structure elucidation from NMR spectra using transformer models",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv-2023-8wxcz",
    "year": 2023
  },
  {
    "abstract": "Nuclear magnetic resonance (NMR) spectroscopy is one of the indispensable techniques in chemistry because it enables us to obtain accurate information on the chemical, electronic, and dynamic properties of molecules. Computational simulation of the NMR spectra requires time-consuming density functional theory (DFT) calculations for an ensemble of molecular conformations. For large flexible molecules, it is considered too high-cost since it requires time-averaging of the instantaneous chemical shifts of each nuclear spin across the conformational space of molecules for NMR timescales. Here, we present a Gaussian process/deep kernel learning-based machine learning (ML) method for enabling us to predict, average in time, and analyze the instantaneous chemical shifts of conformations in the molecular dynamics trajectory. We demonstrate the use of the method by computing the averaged 1H and 13C chemical shifts of each nuclear spin of a trefoil knot molecule consisting of 24 para-connected benzene rings (240 atoms). By training ML model with the chemical shift data obtained from DFT calculations, we predicted chemical shifts for each conformation during dynamics. We were able to observe the merging of the time-averaged chemical shifts of each nuclear spin in a singlet 1H NMR peak and two 13C NMR peaks for the knot molecule, in agreement with experimental measurements. The unique feature of the presented method is the use of the learned low-dimensional deep kernel representation of local spin environments for comparing and analyzing the local chemical environment histories of spins during dynamics. It allowed us to identify two groups of protons in the knot molecule, which implies that the observed singlet 1H NMR peak could be composed of the contributions from protons with two distinct local chemical environments.",
    "author": "Tsitsvero, Mikhail; Pirillo, Jenny; Hijikata, Yuh; Komatsuzaki, Tamiki",
    "doi": "10.1063/5.0147398",
    "id": "Tsitsvero-2023-NMRKnot",
    "journal": "The Journal of Chemical Physics",
    "note": "This paper is part of the JCP Special Topic on Machine Learning Hits Molecular Simulations.",
    "number": "",
    "pages": "194108",
    "title": "NMR spectrum prediction for dynamic molecules by machine learning: A case study of trefoil knot molecule",
    "type": "article",
    "url": "https://doi.org/10.1063/5.0147398",
    "volume": "158",
    "year": 2023
  },
  {
    "author": "Van Bramer, Scott; Bastin, Loyd",
    "doi": "10.6084/m9.figshare.23739492.v1",
    "id": "VanBramer-2023-MnovaDatabase",
    "publisher": "figshare",
    "title": "Mnova Spectroscopy Database by Compound",
    "type": "dataset",
    "url": "https://doi.org/10.6084/m9.figshare.23739492.v1",
    "year": 2023
  },
  {
    "author": "Van Bramer, Scott E.; Bastin, Loyd D.",
    "doi": "10.1021/acs.jchemed.3c00046",
    "id": "VanBramer-2023-SpectroscopyTeaching",
    "journal": "Journal of Chemical Education",
    "number": "10",
    "pages": "3897-3902",
    "title": "Spectroscopy data for undergraduate teaching",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jchemed.3c00046",
    "volume": "100",
    "year": 2023
  },
  {
    "abstract": "This literature review presents a comprehensive overview of machine learning (ML) applications in proton MR spectroscopy (MRS). As the use of ML techniques in MRS continues to grow, this review aims to provide the MRS community with a structured overview of the state-of-the-art methods. Specifically, we examine and summarize studies published between 2017 and 2023 from major journals in the MR field. We categorize these studies based on a typical MRS workflow, including data acquisition, processing, analysis, and artificial data generation. Our review reveals that ML in MRS is still in its early stages, with a primary focus on processing and analysis techniques, and less attention given to data acquisition. We also found that many studies use similar model architectures, with little comparison to alternative architectures. Additionally, the generation of artificial data is a crucial topic, with no consistent method for its generation. Furthermore, many studies demonstrate that artificial data suffers from generalization issues when tested on in vivo data. We also conclude that risks related to ML models should be addressed, particularly for clinical applications. Therefore, output uncertainty measures and model biases are critical to investigate. Nonetheless, the rapid development of ML in MRS and the promising results from the reviewed studies justify further research in this field.",
    "author": "Van de Sande, Dennis M. J.; Merkofer, Julian P.; Amirrajab, Sina; Veta, Mitko; Van Sloun, Ruud J. G.; Versluis, Maarten J.; Jansen, Jacobus F. A.; Van den Brink, Johan S.; Breeuwer, Marcel",
    "doi": "10.1002/mrm.29793",
    "id": "VanDeSande-2023-MLMRSReview",
    "journal": "Magnetic Resonance in Medicine",
    "publisher": "Wiley",
    "title": "A Review of Machine Learning Applications for the Proton MR Spectroscopy Workflow",
    "type": "article",
    "url": "https://onlinelibrary.wiley.com/doi/10.1002/mrm.29793",
    "year": 2023
  },
  {
    "abstract": "Nuclear Magnetic Resonance (NMR) spectroscopic data can now be fed to algorithms capable of deconvoluting the peaks in the search for its structure.",
    "author": "Priessner, Martin",
    "doi": "10.1038/s41570-023-00538-2",
    "id": "Priessner-2023-NMRDeconvAI",
    "journal": "Nature Reviews Chemistry",
    "pages": "672",
    "publisher": "Nature",
    "title": "NMR deconvolution in the blink of an AI",
    "type": "article",
    "url": "https://doi.org/10.1038/s41570-023-00538-2",
    "volume": "7",
    "year": 2023
  },
  {
    "author": "Schmid, N.; Bruderer, S.; Paruzzo, F.; Fischetti, G.; Toscano, G.; Graf, D.; Fey, M.; Henrici, A.; Ziebart, V.; Heitmann, B.; Grabner, H.; Wegner, J.D.; Sigel, R.K.O.; Wilhelm, D.",
    "doi": "10.1016/j.jmr.2022.107357",
    "id": "Schmid-2023-NMRDeconv",
    "journal": "Journal of Magnetic Resonance",
    "pages": "107357",
    "publisher": "Elsevier BV",
    "title": "Deconvolution of 1D NMR spectra: A deep learning-based approach",
    "type": "article",
    "url": "https://doi.org/10.1016/j.jmr.2022.107357",
    "volume": "347",
    "year": 2023
  },
  {
    "abstract": "Optimal transport (OT) theory focuses on identifying the most efficient maps that transform one probability measure into another by minimizing a given cost function. Traditional approaches often utilize the squared Euclidean distance as the cost function. This paper introduces a novel model for transport maps based on translation-invariant costs of the form c(x, y) = h(x - y), where h combines a quadratic term with a regularizer τ. The authors establish a connection between these transport maps and Bregman centroids associated with the divergence generated by h, as well as the proximal operator of τ. By selecting a sparsity-inducing norm for τ, the resulting transport maps produce sparse displacement vectors Δ(x) = T(x) - x, with sparsity patterns that vary depending on x. The effectiveness of this method is demonstrated on high-dimensional single-cell transcription data, specifically in the 34,000-dimensional space of gene counts, without resorting to dimensionality reduction, thereby preserving interpretability at the gene level.",
    "author": "Cuturi, Marco; Klein, Michal; Ablin, Pierre",
    "id": "Cuturi-2023-MongeBregmanOccam",
    "journal": "Proceedings of the 40th International Conference on Machine Learning",
    "pages": "6671--6682",
    "publisher": "Proceedings of Machine Learning Research (PMLR)",
    "title": "Monge, Bregman, and Occam: Interpretable optimal transport in high-dimensions with feature-sparse maps",
    "type": "article",
    "url": "https://proceedings.mlr.press/v202/cuturi23a.html",
    "volume": "202",
    "year": 2023
  },
  {
    "abstract": "This study introduces an extended Gaussian approximation (EGA) method that incorporates third moments to more accurately calculate the energy landscape of gene regulatory networks. The authors demonstrate that the weighted summation from Gaussian approximation (WSGA) effectively computes landscapes for multistable and limit cycle systems. Applying the EGA approach to multistable genetic circuits and synthetic oscillatory networks, they show that EGA provides a more precise probability distribution and corresponding landscape compared to traditional methods.",
    "author": "Li, Chunhe; Zhang, Yusong",
    "doi": "10.1063/5.0128345",
    "id": "Li-2023-ExtendedGaussian",
    "journal": "Chaos: An Interdisciplinary Journal of Nonlinear Science",
    "number": "2",
    "pages": "023116",
    "title": "An improved approach for calculating energy landscape of gene networks from moment equations",
    "type": "article",
    "url": "https://doi.org/10.1063/5.0128345",
    "volume": "33",
    "year": 2023
  },
  {
    "abstract": "This paper addresses the dynamic Schrödinger bridge problem, which seeks a stochastic process that defines a transport between two target probability measures while being closest, in terms of Kullback-Leibler divergence, to a reference process. The author proposes a novel sampling-based iterative algorithm, the iterated diffusion bridge mixture (IDBM) procedure, aimed at solving this problem. The IDBM procedure exhibits the attractive property of realizing a valid transport between the target probability measures at each iteration. The paper includes a theoretical investigation establishing the convergence properties of the IDBM procedure, complemented by numerical experiments illustrating its competitive performance. Additionally, the author suggests utilizing the first iteration of the IDBM procedure as an approximation-free method for generative modeling, offering greater flexibility in selecting the generative process dynamics and exhibiting accelerated training and superior sample quality over larger discretization intervals.",
    "author": "Peluchetti, Stefano",
    "doi": "10.48550/arXiv.2304.00917",
    "id": "Peluchetti-2023-DiffBridgeSchrodinger",
    "journal": "Journal of Machine Learning Research",
    "title": "Diffusion Bridge Mixture Transports, Schrödinger Bridge Problems and Generative Modeling",
    "type": "article",
    "url": "https://jmlr.org/papers/v24/23-0527.html",
    "volume": "24",
    "year": 2023
  },
  {
    "abstract": "Complex biological processes, such as cellular differentiation, require an intricate rewiring of intra-cellular signalling networks. Previous characterisations of these networks revealed that promiscuity in signalling, quantified by a raised network entropy, underlies a less differentiated and malignant cell state. A theoretical connection between entropy and Ricci curvature has led to applications of discrete curvatures to characterise biological signalling networks at distinct time points during differentiation and malignancy. However, understanding and predicting the dynamics of biological network rewiring remains an open problem. Here we construct a framework to apply discrete Ricci curvature and Ricci flow to the problem of biological network rewiring. By investigating the relationship between network entropy and Forman-Ricci curvature, both theoretically and empirically on single-cell RNA-sequencing data, we demonstrate that the two measures do not always positively correlate, as has been previously suggested, and provide complementary rather than interchangeable information. We next employ discrete normalised Ricci flow, to derive network rewiring trajectories from transcriptomes of stem cells to differentiated cells, which accurately predict true intermediate time points of gene expression time courses. In summary, we present a differential geometry toolkit for investigation of dynamic network rewiring during cellular differentiation and cancer.",
    "author": "Baptista, Anthony; MacArthur, Ben D.; Banerji, Christopher R. S.",
    "doi": "10.1101/2023.07.20.549833",
    "id": "Baptista-2024-RicciFlow",
    "journal": "bioRxiv",
    "keywords": "Cellular Differentiation, Ricci Flow, Network Rewiring, Discrete Geometry",
    "pages": "N/A",
    "title": "Charting cellular differentiation trajectories with Ricci flow",
    "type": "article",
    "url": "https://doi.org/10.1101/2023.07.20.549833",
    "volume": "N/A",
    "year": 2023
  },
  {
    "abstract": "This paper addresses the dynamic formulation of optimal transport, known as the Schrödinger bridge (SB) problem, specifically between Gaussian measures. The authors derive closed-form expressions for SBs between Gaussian measures, utilizing tools from entropic optimal transport, Riemannian geometry, and generator theory. They demonstrate that solutions to SBs between Gaussian measures are Gaussian processes with explicit mean and covariance kernels, facilitating applications in generative modeling and interpolation. The paper also introduces a method for modeling the evolution of single-cell genomics data, showing improved numerical stability over existing SB-based approaches.",
    "author": "Bunne, Charlotte; Hsieh, Ya-Ping; Cuturi, Marco; Krause, Andreas",
    "booktitle": "The 26th International Conference on Artificial Intelligence and Statistics (AISTATS)",
    "id": "Bunne-2023-Bridge",
    "journal": "Proceedings of Machine Learning Research",
    "keywords": "Schrödinger Bridge, Gaussian Measures, Optimal Transport, Generative Modeling",
    "pages": "5802–5833",
    "title": "The Schrödinger Bridge between Gaussian Measures has a Closed Form",
    "type": "article",
    "url": "https://proceedings.mlr.press/v206/bunne23a.html",
    "volume": "206",
    "year": 2023
  },
  {
    "abstract": "The ability to understand and predict molecular responses towards external perturbations is a core question in molecular biology. Technological advancements in the recent past have enabled the generation of high-resolution single-cell data, making it possible to profile individual cells under different experimentally controlled perturbations. However, cells are typically destroyed during measurement, resulting in unpaired distributions over either perturbed or non-perturbed cells. Leveraging the theory of optimal transport and the recent advents of convex neural architectures, we learn a coupling describing the response of cell populations upon perturbation, enabling us to predict state trajectories on a single-cell level. We apply our approach, CellOT, to predict treatment responses of 21,650 cells subject to four different drug perturbations. CellOT outperforms current state-of-the-art methods both qualitatively and quantitatively, accurately capturing cellular behavior shifts across all different drugs.",
    "author": "Bunne, Charlotte; Stark, Stefan G.; Gut, Gabriele; Del Castillo, Jacobo Sarabia; Levesque, Marc; Lehmann, Kjong-Van; Pelkmans, Lucas; Krause, Andreas; Rätsch, Gunnar",
    "doi": "10.1038/s41592-023-01969-x",
    "id": "Bunne-2023-SingleCellResponses",
    "journal": "Nature Methods",
    "keywords": "Distribution-to-Distribution Regression; Single-Cell Perturbation; Neural Optimal Transport; Scalable Machine Learning; Single-Cell RNA Sequencing; Probabilistic Regression; Cell Response Prediction",
    "number": "11",
    "pages": "1759–1768",
    "title": "Learning single-cell perturbation responses using neural optimal transport",
    "type": "article",
    "url": "https://doi.org/10.1038/s41592-023-01969-x",
    "volume": "20",
    "year": 2023
  },
  {
    "abstract": "This study introduces a probabilistic model called Gaussian Process Spatial Alignment (GPSA) to align spatially resolved genomic samples. GPSA uses a two-layer Gaussian process: the first maps observed samples' spatial locations onto a common coordinate system (CCS), and the second maps the CCS to observed phenotypic readouts (e.g., gene expression). The model facilitates analyses like variance assessment, 3D atlas creation from 2D slices, and association tests across data modalities.",
    "author": "Jones, Andrew; Townes, F. William; Li, Didong; Engelhardt, Barbara E.",
    "doi": "10.1038/s41592-023-01972-2",
    "id": "Jones-2023-GPSA",
    "journal": "Nature Methods",
    "number": "9",
    "pages": "1379-1387",
    "title": "Alignment of spatial genomics data using deep Gaussian processes",
    "type": "article",
    "url": "https://doi.org/10.1038/s41592-023-01972-2",
    "volume": "20",
    "year": 2023
  },
  {
    "abstract": "This study explores the characteristics and spread of plasmid-borne colistin resistance genes (mcr) among pigs, workers with animal contact, and their household members in Thailand. It identifies various mcr variants and plasmid types, emphasizing the need for monitoring and interventions to control the spread of colistin resistance through horizontal gene transfer.",
    "author": "Leangapichart, Thongpan; Stosic, Milan S.; Hickman, Rachel A.; Lunha, Kamonwan; Jiwakanon, Jatesada; Angkititrakul, Sunpetch; Magnusson, Ulf; Van Boeckel, Thomas P.; Järhult, Josef D.; Sunde, Marianne",
    "doi": "10.1093/jac/dkad097",
    "id": "Leangapichart-2023-MCRGenesThailand",
    "journal": "Journal of Antimicrobial Chemotherapy",
    "number": "6",
    "pages": "1395-1405",
    "publisher": "Oxford University Press (OUP)",
    "title": "Exploring the Epidemiology of mcr Genes, Genetic Context, and Plasmids in Enterobacteriaceae Originating from Pigs and Humans on Farms in Thailand",
    "type": "article",
    "url": "https://doi.org/10.1093/jac/dkad097",
    "volume": "78",
    "year": 2023
  },
  {
    "abstract": "This study examines global drivers of antimicrobial resistance (AMR) emergence, focusing on human and livestock antibiotic use, economic activity, and travel. Predictive models and maps highlight surveillance gaps and intervention priorities.",
    "author": "Mendelsohn, Emma; Ross, Noam; Zambrana-Torrelio, Carlos; Van Boeckel, Thomas P.; Laxminarayan, Ramanan; Daszak, Peter",
    "doi": "10.1098/rspb.2023.1085",
    "id": "Mendelsohn-2023-AMREmergence",
    "journal": "Proceedings of the Royal Society B: Biological Sciences",
    "number": "2000",
    "pages": "20231085",
    "publisher": "The Royal Society",
    "title": "Global Patterns and Correlates in the Emergence of Antimicrobial Resistance in Humans",
    "type": "article",
    "url": "https://doi.org/10.1098/rspb.2023.1085",
    "volume": "290",
    "year": 2023
  },
  {
    "abstract": "This paper explores the potential of implementing taxation on veterinary antibiotics as a strategy to curb their overuse and combat antimicrobial resistance (AMR). By analyzing economic incentives and regulatory frameworks, it emphasizes the importance of balancing public health priorities with sustainable agricultural practices.",
    "author": "Morgan, Alex L.K.; Moran, Dominic; Van Boeckel, Thomas P.",
    "doi": "10.1016/j.onehlt.2023.100650",
    "id": "Morgan-2023-TaxationAMR",
    "journal": "One Health",
    "number": "C",
    "pages": "100650",
    "publisher": "Elsevier BV",
    "title": "Taxation of Veterinary Antibiotics to Reduce Antimicrobial Resistance",
    "type": "article",
    "url": "https://doi.org/10.1016/j.onehlt.2023.100650",
    "volume": "17",
    "year": 2023
  },
  {
    "abstract": "This study estimates antimicrobial usage in food-producing animals globally and projects future trends from 2020 to 2030. It identifies hotspots of antimicrobial use, mainly in Asia, and emphasizes the need for national reporting and stewardship programs to address antimicrobial resistance (AMR) risks.",
    "author": "Mulchandani, Ranya; Wang, Yu; Gilbert, Marius; Van Boeckel, Thomas P.",
    "doi": "10.1371/journal.pgph.0001305",
    "id": "Mulchandani-2023-AMRTrends",
    "journal": "PLOS Global Public Health",
    "number": "2",
    "pages": "e0001305",
    "publisher": "Public Library of Science (PLoS)",
    "title": "Global Trends in Antimicrobial Use in Food-Producing Animals: 2020 to 2030",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pgph.0001305",
    "volume": "3",
    "year": 2023
  },
  {
    "abstract": "The study investigates the geographic dispersal and host transitions of Streptococcus agalactiae sequence type 283 (ST283), which has been associated with human and fish infections. Using Bayesian phylogeographic analyses, the research reveals bidirectional human-fish host switching, suggesting that wastewater management and sanitation can reduce transmission between hosts. The study underscores the importance of surveillance to mitigate risks associated with this pathogen.",
    "author": "Schar, Daniel; Zhang, Zhenyu; Pires, Joao; Vrancken, Bram; Suchard, Marc A.; Lemey, Philippe; Ip, Margaret; Gilbert, Marius; Van Boeckel, Thomas P.; Dellicour, Simon",
    "doi": "10.1371/journal.pgph.0002454",
    "id": "Schar-2023-StreptococcusSwitching",
    "journal": "PLOS Global Public Health",
    "number": "10",
    "pages": "e0002454",
    "publisher": "Public Library of Science (PLoS)",
    "title": "Dispersal History and Bidirectional Host Switching of Streptococcus agalactiae",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pgph.0002454",
    "volume": "3",
    "year": 2023
  },
  {
    "abstract": "This paper introduces the concept of lipotype as a functional determinant in cellular processes, emphasizing the dynamic roles of lipid composition and diversity in membrane biology.",
    "author": "D’Angelo, Giovanni; La Manno, Gioele",
    "doi": "10.1038/s41580-022-00556-w",
    "id": "DAngelo-2022-Lipotype",
    "journal": "Nature Reviews Molecular Cell Biology",
    "number": "1",
    "pages": "1-2",
    "publisher": "Springer Science and Business Media LLC",
    "title": "The lipotype hypothesis",
    "type": "article",
    "url": "https://doi.org/10.1038/s41580-022-00556-w",
    "volume": "24",
    "year": 2023
  },
  {
    "abstract": "This publication investigates the region-specific and age-related diversity of glial cells in the CNS, highlighting sex differences and implications for therapeutic strategies in neurodegenerative diseases.",
    "author": "Seeker, Luise A.; Bestard-Cuche, Nadine; Jäkel, Sarah; Kazakou, Nina-Lydia; Bøstrand, Sunniva M. K.; Wagstaff, Laura J.; Cholewa-Waclaw, Justyna; Kilpatrick, Alastair M.; Van Bruggen, David; Kabbe, Mukund; Baldivia Pohl, Fabio; Moslehi, Zahra; Henderson, Neil C.; Vallejos, Catalina A.; La Manno, Gioele; Castelo-Branco, Goncalo; Williams, Anna",
    "doi": "10.1186/s40478-023-01568-z",
    "id": "Seeker-2023-BrainMatters",
    "journal": "Acta Neuropathologica Communications",
    "number": "1",
    "pages": "Article 84",
    "publisher": "Springer Science and Business Media LLC",
    "title": "Brain matters: unveiling the distinct contributions of region, age, and sex to glia diversity and CNS function",
    "type": "article",
    "url": "https://doi.org/10.1186/s40478-023-01568-z",
    "volume": "11",
    "year": 2023
  },
  {
    "abstract": "This publication explores the diversity and developmental trajectories of neocortical progenitors in mammalian brains, contributing to our understanding of cortical evolution and neurogenesis.",
    "author": "Alieh, Leila Haj Abdullah; Herrera, Antonio; La Manno, Gioele",
    "doi": "10.1016/j.coisb.2023.100444",
    "id": "Alieh-2023-NeocorticalProgenitors",
    "journal": "Current Opinion in Systems Biology",
    "pages": "100444",
    "publisher": "Elsevier",
    "title": "Heterogeneity and developmental dynamics of mammalian neocortical progenitors",
    "type": "article",
    "url": "https://doi.org/10.1016/j.coisb.2023.100444",
    "volume": "32-33",
    "year": 2023
  },
  {
    "abstract": "This study presents LSSAMP, a model that combines sequence and structural information for designing antimicrobial peptides. Utilizing multi-scale vector quantization, LSSAMP captures secondary structures, enabling the generation of peptides with favorable attributes. Experimental results indicate a high probability of antimicrobial activity in the generated peptides, with two candidates showing strong efficacy in laboratory tests.",
    "author": "Wang, Danqing; Wen, Zeyu; Ye, Fei; Li, Lei; Zhou, Hao",
    "doi": "10.48550/arXiv.2212.09450",
    "id": "Wang-2022-LSSAMP",
    "journal": "International Conference on Learning Representations (ICLR)",
    "keywords": "antimicrobial peptides, de novo design, latent sequence-structure model, secondary structure, peptide generation",
    "title": "Accelerating Antimicrobial Peptide Discovery with Latent Sequence-Structure Model",
    "type": "conference",
    "url": "https://doi.org/10.48550/arXiv.2212.09450",
    "year": 2023
  },
  {
    "abstract": "Introduces PLM-ARG, an artificial intelligence-based framework utilizing a large protein language model to identify ARGs and classify resistance categories. PLM-ARG demonstrates superior performance, achieving MCCs of 0.983 in cross-validation and 0.838 in independent validation, significantly outperforming existing ARG prediction tools.",
    "author": "Wu, Jun; Ouyang, Jian; Qin, Haipeng; Zhou, Jiajia; Roberts, Ruth; Siam, Rania; Wang, Lan; Tong, Weida; Liu, Zhichao; Shi, Tieliu",
    "doi": "10.1093/bioinformatics/btad690",
    "id": "Wu-2023-PLMARG",
    "journal": "Bioinformatics",
    "keywords": "protein language models, PLM-ARG, antibiotic resistance genes, ARG identification, AI frameworks",
    "number": "11",
    "title": "PLM-ARG: Antibiotic Resistance Gene Identification Using a Pretrained Protein Language Model",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btad690",
    "volume": "39",
    "year": 2023
  },
  {
    "abstract": "This study introduces pLM-BLAST, a tool inspired by BLAST that uses single-sequence embeddings from ProtT5 for homology detection. It outperforms HHsearch in both accuracy and speed, especially for divergent sequences, and offers a local alignment feature for discovering novel homologs.",
    "author": "Kaminski, Kamil; Ludwiczak, Jan; Pawlicki, Kamil; Alva, Vikram; Dunin-Horkawicz, Stanislaw",
    "doi": "10.1093/bioinformatics/btad579",
    "id": "Kaminski-2023-pLMBLAST",
    "journal": "Bioinformatics",
    "keywords": "protein language models, pLM-BLAST, ProtT5, homology detection, antimicrobial resistance",
    "number": "10",
    "title": "pLM-BLAST: Distant Homology Detection Based on Direct Comparison of Sequence Representations from Protein Language Models",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btad579",
    "volume": "39",
    "year": 2023
  },
  {
    "abstract": "This study introduces PeptideBERT, a protein language model tailored for predicting peptide properties such as hemolysis, solubility, and non-fouling. Utilizing the ProtBERT pretrained transformer model with 12 attention heads and 12 hidden layers, PeptideBERT is fine-tuned for these specific tasks. The model achieves state-of-the-art performance in predicting hemolysis and demonstrates remarkable accuracy in assessing a peptide's capacity to resist non-specific interactions.",
    "author": "Guntuboina, Chakradhar; Das, Adrita; Mollaei, Parisa; Kim, Seongwon; Farimani, Amir Barati",
    "doi": "10.1021/acs.jpclett.3c02398",
    "id": "Guntuboina-2023-PeptideBERT",
    "journal": "The Journal of Physical Chemistry Letters",
    "keywords": "PeptideBERT, peptide property prediction, ProtBERT, hemolysis, solubility, non-fouling",
    "title": "PeptideBERT: A Language Model Based on Transformers for Peptide Property Prediction",
    "type": "article",
    "url": "https://pubs.acs.org/doi/10.1021/acs.jpclett.3c02398",
    "year": 2023
  },
  {
    "abstract": "This study introduces a workflow utilizing ESM1b, a 650-million-parameter protein language model, to predict the functional impact of all ~450 million possible missense variants in the human genome. The model outperforms existing methods in classifying approximately 150,000 ClinVar and HGMD missense variants as pathogenic or benign and in predicting outcomes across 28 deep mutational scan datasets.",
    "author": "Brandes, Nadav; Goldman, Grant; Wang, Charlotte H.; Ye, Chun Jimmie; Ntranos, Vasilis",
    "doi": "10.1038/s41588-023-01465-0",
    "id": "Brandes-2023-GenomeWidePrediction",
    "journal": "Nature Genetics",
    "keywords": "protein language model, missense variants, functional impact prediction, ESM1b",
    "number": "9",
    "pages": "1512-1522",
    "title": "Genome-wide prediction of disease variant effects with a deep protein language model",
    "type": "article",
    "url": "https://www.nature.com/articles/s41588-023-01465-0",
    "volume": "55",
    "year": 2023
  },
  {
    "abstract": "This study presents ProGen, a language model trained on 280 million protein sequences from over 19,000 families. ProGen can generate protein sequences with predictable functions across diverse families, akin to generating grammatically and semantically correct natural language sentences on various topics. The model was fine-tuned to specific lysozyme families, producing artificial proteins with catalytic efficiencies comparable to natural lysozymes, despite sequence identities as low as 31.4%.",
    "author": "Madani, Ali; Krause, Ben; Greene, Eric R.; Subramanian, Subu; Mohr, Benjamin P.; Holton, James M.; Olmos, Jose Luis; Xiong, Caiming; Sun, Zachary Z.; Socher, Richard; Fraser, James S.; Naik, Nikhil",
    "doi": "10.1038/s41587-022-01618-2",
    "id": "Madani-2023-ProGen",
    "journal": "Nature Biotechnology",
    "keywords": "ProGen, Protein language model, Controlled protein generation, Functional proteins, Lysozymes",
    "number": "8",
    "pages": "1099–1106",
    "title": "Large language models generate functional protein sequences across diverse families",
    "type": "article",
    "volume": "41",
    "year": 2023
  },
  {
    "abstract": "This paper introduces Ankh, a general-purpose protein language model trained on Google's TPU-v4. Ankh surpasses state-of-the-art performance with fewer parameters, emphasizing data-efficient and cost-reduced optimization for protein modeling tasks.",
    "author": "Elnaggar, Ahmed; Essam, Hazem; Salah-Eldin, Wafaa; Moustafa, Walid; Elkerdawy, Mohamed; Rochereau, Charlotte; Rost, Burkhard",
    "doi": "10.48550/arXiv.2301.06568",
    "id": "Elnaggar-2023-Ankh",
    "journal": "arXiv preprint arXiv:2301.06568",
    "keywords": "Ankh, Protein language model, Data-efficient training, General-purpose modeling",
    "title": "Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling",
    "type": "article",
    "year": 2023
  },
  {
    "abstract": "Protein language models trained on evolutionary-scale sequence data learn structure and function without explicit supervision. We develop ESM-2, a family of protein language models ranging up to 15 billion parameters, and show that scaling up language models leads to improved performance across a range of structure prediction tasks. We introduce ESMFold, a method for predicting accurate protein structures directly from the individual sequences of these models. ESMFold achieves high accuracy, with average TM-scores of 0.85 on recent sequences without close homologs, and is efficient, predicting structures in minutes. We predict structures for the entire UniProt database, covering over 617 million proteins, and make these predictions available to the community. Our results demonstrate the potential of language models to accelerate structural biology by enabling the prediction of structures at the scale of the known protein universe.",
    "author": "Lin, Zeming; Akin, Halil; Rao, Roshan; Hie, Brian; Zhu, Zhongkai; Lu, Wenting; Smetanin, Nikita; Verkuil, Robert; Kabeli, Ori; Shmueli, Yaniv; Costa, Allan dos Santos; Fazel-Zarandi, Maryam; Sercu, Tom; Candido, Salvatore; Rives, Alexander",
    "doi": "10.1126/science.ade2574",
    "id": "Lin-2023-ESMFold",
    "journal": "Science",
    "keywords": "Protein structure prediction, Language model, ESM-2, ESMFold, Evolutionary-scale sequence data, Structural biology, UniProt database",
    "pages": "1123-1130",
    "title": "Evolutionary-scale prediction of atomic-level protein structure with a language model",
    "type": "article",
    "volume": "379",
    "year": 2023
  },
  {
    "abstract": "ML-J-DP4 integrates fast J-coupling calculations with machine-learning-corrected HF/STO-3G NMR predictions into the J-DP4 formalism, enabling accurate structure elucidation of complex molecules within minutes on standard hardware.",
    "author": "Tsai, Yi-Hsuan; Amichetti, Milagros; Zanardi, María Marta; Grimson, Rafael; Hernandez Daranas, Antonio; Sarotti, Ariel M.",
    "doi": "10.1021/acs.orglett.2c01251",
    "id": "Tsai-2022-MLJDP4",
    "journal": "Organic Letters",
    "number": "41",
    "pages": "7487–7491",
    "title": "ML-J-DP4: An Integrated Quantum Mechanics-Machine Learning Approach for Ultrafast NMR Structural Elucidation",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.orglett.2c01251",
    "volume": "24",
    "year": 2022
  },
  {
    "abstract": "Proposes a scalable GNN for NMR chemical shift prediction using a sparsified heavy-atom graph, attention-based message passing, and dual-level readout, achieving higher accuracy and efficiency on ¹³C and ¹H datasets.",
    "author": "Han, Jongmin; Kang, Hyungu; Kang, Seokho; Kwon, Youngchun; Lee, Dongseon; Choi, Youn-Suk",
    "doi": "10.1039/D2CP04542G",
    "id": "Han-2022-ScalableGNN",
    "journal": "Physical Chemistry Chemical Physics",
    "pages": "26870–26878",
    "title": "Scalable graph neural network for NMR chemical shift prediction",
    "type": "article",
    "url": "https://doi.org/10.1039/D2CP04542G",
    "volume": "24",
    "year": 2022
  },
  {
    "abstract": "This review surveys methods for predicting NMR chemical shifts, contrasting data-driven approaches with traditional ab initio methods. It highlights the advantages of empirical models—such as speed and scalability—enabled by modern ML techniques and large spectral databases, and discusses their applications, limitations, and underlying data resources.",
    "author": "Jonas, Eric; Kuhn, Stefan; Schlörer, Nils",
    "doi": "10.1002/mrc.5234",
    "id": "Jonas-2022-ShiftReview",
    "journal": "Magnetic Resonance in Chemistry",
    "number": "11",
    "pages": "1021–1031",
    "title": "Prediction of chemical shift in NMR: A review",
    "type": "article",
    "url": "https://doi.org/10.1002/mrc.5234",
    "volume": "60",
    "year": 2022
  },
  {
    "abstract": "This study proposes a deep reinforcement learning framework for solving the molecular inverse problem: predicting molecular structures from NMR spectra. The method combines Monte Carlo tree search and graph convolutional networks to iteratively construct molecules, achieving ~80% top-3 accuracy for small molecules (<10 heavy atoms).",
    "author": "Sridharan, Bhuvanesh; Mehta, Sarvesh; Pathak, Yashaswi; Priyakumar, U. Deva",
    "doi": "10.1021/acs.jpclett.2c00624",
    "id": "Sridharan-2022-DRLNMR",
    "journal": "Journal of Physical Chemistry Letters",
    "number": "22",
    "pages": "4924–4933",
    "title": "Deep reinforcement learning for molecular inverse problem of NMR spectra to molecular structure",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jpclett.2c00624",
    "volume": "13",
    "year": 2022
  },
  {
    "abstract": "This study introduces feature selection methods—equidistant and peak sampling—for NMR spectral data, evaluated using SVM and KNN classifiers. Peak sampling yielded superior results. A recurrent neural network (RNN) model trained on peak-sampled data further improved generalization and hyperparameter optimization over traditional models.",
    "author": "Li, Chongcan; Cong, Yong; Deng, Weihua",
    "doi": "10.1002/mrc.5292",
    "id": "Li-2022-NMRGroups",
    "journal": "Magnetic Resonance in Chemistry",
    "number": "12",
    "pages": "1061–1069",
    "title": "Identifying molecular functional groups of organic compounds by deep learning of NMR data",
    "type": "article",
    "url": "https://doi.org/10.1002/mrc.5292",
    "volume": "60",
    "year": 2022
  },
  {
    "abstract": "Synthesis planning and reaction outcome prediction are two fundamental problems in computer-aided organic chemistry for which a variety of data-driven approaches have emerged. Natural language approaches that model each problem as a SMILES-to-SMILES translation lead to a simple end-to-end formulation, reduce the need for data preprocessing, and enable the use of well-optimized machine translation model architectures. However, SMILES representations are not efficient for capturing information about molecular structures, as evidenced by the success of SMILES augmentation to boost empirical performance. Here, we describe a novel Graph2SMILES model that combines the power of Transformer models for text generation with the permutation invariance of molecular graph encoders that mitigates the need for input data augmentation. In our encoder, a directed message passing neural network (D-MPNN) captures local chemical environments, and the global attention encoder allows for long-range and intermolecular interactions, enhanced by graph-aware positional embedding. As an end-to-end architecture, Graph2SMILES can be used as a drop-in replacement for the Transformer in any task involving molecule(s)-to-molecule(s) transformations, which we empirically demonstrate leads to improved performance on existing benchmarks for both retrosynthesis and reaction outcome prediction.",
    "author": "Tu, Zhengkai; Coley, Connor W.",
    "doi": "10.1021/acs.jcim.2c00321",
    "id": "Tu-2022-Graph2SMILES",
    "journal": "Journal of Chemical Information and Modeling",
    "note": "Machine Learning and Deep Learning issue, published July 26, 2022",
    "number": "15",
    "pages": "3503–3513",
    "publisher": "American Chemical Society",
    "title": "Permutation invariant graph-to-sequence model for template-free retrosynthesis and reaction prediction",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.2c00321",
    "volume": "62",
    "year": 2022
  },
  {
    "abstract": "MolCLR is a self-supervised framework for molecular property prediction that pretrains graph neural network encoders using contrastive learning on ~10M unlabelled molecules. It applies three augmentations—atom masking, bond deletion, and subgraph removal—and maximizes agreement between augmentations of the same molecule while minimizing it across different molecules. MolCLR improves GNN performance on classification and regression benchmarks, achieving state-of-the-art results after fine-tuning and capturing chemically meaningful similarities.",
    "author": "Wang, Yuyang; Wang, Jianren; Cao, Zhonglin; Farimani, Amir Barati",
    "doi": "10.1038/s42256-022-00447-x",
    "id": "Wang-2022-MolCLR",
    "journal": "Nature Machine Intelligence",
    "pages": "279--287",
    "title": "Molecular contrastive learning of representations via graph neural networks",
    "type": "article",
    "url": "https://www.nature.com/articles/s42256-022-00447-x",
    "volume": "4",
    "year": 2022
  },
  {
    "abstract": "Deep learning has been a prevalence in computational chemistry and widely implemented in molecular property predictions. Recently, self-supervised learning (SSL), especially contrastive learning (CL), has gathered growing attention for the potential to learn molecular representations that generalize to the gigantic chemical space. Unlike supervised learning, SSL can directly leverage large unlabeled data, which greatly reduces the effort to acquire molecular property labels through costly and time-consuming simulations or experiments. However, most molecular SSL methods borrow the insights from the machine learning community but neglect the unique cheminformatics (e.g., molecular fingerprints) and multilevel graphical structures (e.g., functional groups) of molecules. In this work, we propose iMolCLR, improvement of Molecular Contrastive Learning of Representations with graph neural networks (GNNs) in two aspects: (1) mitigating faulty negative contrastive instances via considering cheminformatics similarities between molecule pairs and (2) fragment-level contrasting between intramolecule and intermolecule substructures decomposed from molecules. Experiments have shown that the proposed strategies significantly improve the performance of GNN models on various challenging molecular property predictions. In comparison to the previous CL framework, iMolCLR demonstrates an averaged 1.2% improvement of ROC-AUC on eight classification benchmarks and an averaged 10.1% decrease of the error on six regression benchmarks. On most benchmarks, the generic GNN pretrained by iMolCLR rivals or even surpasses supervised learning models with sophisticated architectures and engineered features. Further investigations demonstrate that representations learned through iMolCLR intrinsically embed scaffolds and functional groups that can reason molecule similarities.",
    "author": "Wang, Yuyang; Magar, Rishikesh; Liang, Chen; Farimani, Amir Barati",
    "doi": "10.1021/acs.jcim.2c00495",
    "id": "Wang-2022-iMolCLR",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "11",
    "pages": "2713–2725",
    "publisher": "American Chemical Society",
    "title": "Improving molecular contrastive learning via faulty negative mitigation and decomposed fragment contrast",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.2c00495",
    "volume": "62",
    "year": 2022
  },
  {
    "abstract": "MM-Deacon is a multilingual molecular embedding framework pre-trained using contrastive learning on SMILES and IUPAC representations. By aligning embeddings across molecular languages, it improves robustness in molecular property prediction, zero-shot cross-lingual retrieval, and drug-drug interaction prediction.",
    "author": "Guo, Zhihui; Sharma, Pramod; Martinez, Andy; Du, Liang; Abraham, Robin",
    "booktitle": "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    "doi": "10.18653/v1/2022.acl-long.242",
    "id": "Guo-2022-MMDeacon",
    "keywords": "MM-Deacon, multilingual molecular representation, contrastive learning, SMILES, IUPAC, chemical language models",
    "pages": "3441–3453",
    "publisher": "Association for Computational Linguistics",
    "title": "Multilingual molecular representation learning via contrastive pre-training",
    "type": "inproceedings",
    "url": "https://aclanthology.org/2022.acl-long.242",
    "year": 2022
  },
  {
    "abstract": "SMICLR is a contrastive learning framework that leverages both SMILES and graph-based molecular representations for semisupervised and unsupervised learning. The method jointly trains a graph encoder and a SMILES encoder to align multimodal features. Fine-tuned on QM9, it reduces prediction error significantly for energetic and electronic properties. SMICLR also achieves strong performance in unsupervised tasks.",
    "author": "Pinheiro, Gabriel A.; Silva, Juarez L. F. Da; Quiles, Marcos G.",
    "doi": "10.1021/acs.jcim.2c00521",
    "id": "Pinheiro-2022-SMICLR",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "SMICLR, contrastive learning, molecular representation, SMILES, graph encoder, semisupervised learning",
    "number": "17",
    "pages": "3948–3960",
    "publisher": "American Chemical Society",
    "title": "SMICLR: Contrastive learning on multiple molecular representations for semisupervised and unsupervised representation learning",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.2c00521",
    "volume": "62",
    "year": 2022
  },
  {
    "abstract": "MTL-BERT is a multitask learning framework for molecular property prediction that combines large-scale self-supervised pretraining with SMILES enumeration and multitask fine-tuning. It uses contextual information in SMILES strings to improve generalization from limited labeled data. Experimental results on 60 datasets show MTL-BERT outperforms state-of-the-art models. The model’s attention mechanism enhances interpretability by identifying key SMILES features.",
    "author": "Zhang, Xiao-Chen; Wu, Cheng-Kun; Yi, Jia-Cai; Zeng, Xiang-Xiang; Yang, Can-Qun; Lu, Ai-Ping; Hou, Ting-Jun; Cao, Dong-Sheng",
    "doi": "10.34133/research.0004",
    "id": "Zhang-2022-MTLBERT",
    "journal": "Research",
    "keywords": "MTL-BERT, SMILES enumeration, molecular property prediction, multitask learning, deep learning, BERT",
    "note": "Published: 15 Dec 2022",
    "pages": "Article ID: 0004",
    "title": "Pushing the boundaries of molecular property prediction for drug discovery with multitask learning BERT enhanced by SMILES enumeration",
    "type": "article",
    "url": "https://doi.org/10.34133/research.0004",
    "volume": "2022",
    "year": 2022
  },
  {
    "abstract": "X-MOL applies large-scale pre-training on 1.1 billion molecules to enhance molecular representation learning. Using the SMILES format and Baidu’s PaddlePaddle platform, it achieves state-of-the-art performance on molecular property prediction, reaction analysis, drug-drug interaction prediction, de novo molecular generation, and molecule optimization. This study highlights the effectiveness of large-scale pre-training and fine-tuning strategies in unifying molecular analysis tasks and improving their performance.",
    "author": "Xue, Dongyu; Zhang, Han; Chen, Xiaohan; Xiao, Dongling; Gong, Yukang; Chuai, Guohui; Sun, Yu; Tian, Hao; Wu, Hua; Li, Yukun; Liu, Qi",
    "doi": "10.1016/j.scib.2022.01.029",
    "id": "Xue-2022-XMOL",
    "journal": "Science Bulletin",
    "keywords": "X-MOL, Large-scale Pre-training, Molecular Representation Learning, SMILES, Drug Discovery, Machine Learning",
    "number": "9",
    "pages": "899-902",
    "title": "X-MOL: Large-scale pre-training for molecular understanding and diverse molecular analysis",
    "type": "article",
    "url": "https://doi.org/10.1016/j.scib.2022.01.029",
    "volume": "67",
    "year": 2022
  },
  {
    "abstract": "Deep learning-based molecular generation, known as inverse molecular design, is gaining traction in drug design. This study leverages SMILES representations and transformer models inspired by GPT architectures to develop MolGPT, a transformer-decoder trained on a next-token prediction task. MolGPT generates valid, unique, and novel molecules while allowing control over molecular properties and scaffold structures. Saliency maps provide interpretability, highlighting key features in the generative process.",
    "author": "Bagal, Viraj; Aggarwal, Rishal; Vinod, P. K.; Priyakumar, U. Deva",
    "doi": "10.1021/acs.jcim.1c00600",
    "id": "Bagal-2022-MolGPT",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "MolGPT, Molecular Generation, Transformer, Deep Learning, Drug Design, SMILES, Generative AI",
    "number": "9",
    "pages": "2064–2076",
    "title": "MolGPT: Molecular Generation Using a Transformer-Decoder Model",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.1c00600",
    "volume": "62",
    "year": 2022
  },
  {
    "abstract": "This paper introduces Chemformer, a transformer-based model pre-trained for computational chemistry applications. It leverages SMILES representations and self-supervised learning techniques to improve molecular generation, reaction prediction, and property estimation.",
    "author": "Irwin, Ross; Dimitriadis, Spyridon; He, Jiazhen; Bjerrum, Esben Jannik",
    "doi": "10.1088/2632-2153/ac3ffb",
    "id": "Irwin-2022-Chemformer",
    "journal": "Machine Learning: Science and Technology",
    "keywords": "Chemformer, Transformer, Computational Chemistry, SMILES, Self-Supervised Learning, Molecular Generation",
    "number": "1",
    "pages": "015022",
    "title": "Chemformer: a pre-trained transformer for computational chemistry",
    "type": "article",
    "url": "https://doi.org/10.1088/2632-2153/ac3ffb",
    "volume": "3",
    "year": 2022
  },
  {
    "abstract": "Mass spectrometry (MS) is a convenient, highly sensitive, and reliable method for the analysis of complex mixtures, which is vital for materials science, life sciences fields such as metabolomics and proteomics, and mechanistic research in chemistry. Although it is one of the most powerful methods for individual compound detection, complete signal assignment in complex mixtures is still a great challenge. The unconstrained formula-generating algorithm, covering the entire spectra and revealing components, is a \"dream tool\" for researchers. We present the framework for efficient MS data interpretation, describing a novel approach for detailed analysis based on deisotoping performed by gradient-boosted decision trees and a neural network that generates molecular formulas from the fine isotopic structure, approaching the long-standing inverse spectral problem. The methods were successfully tested on three examples: fragment ion analysis in protein sequencing for proteomics, analysis of the natural samples for life sciences, and study of the cross-coupling catalytic system for chemistry.",
    "author": "Boiko, Daniil A.; Kozlov, Konstantin S.; Burykina, J.; Ilyushenkova, V.; Ananikov, V.",
    "doi": "10.1021/jacs.2c03631",
    "id": "Boiko-2022-MSAnalysis",
    "journal": "Journal of the American Chemical Society",
    "number": "32",
    "pages": "14590–14606",
    "title": "Fully Automated Unconstrained Analysis of High-Resolution Mass Spectrometry Data with Machine Learning",
    "type": "article",
    "url": "https://doi.org/10.1021/jacs.2c03631",
    "volume": "144",
    "year": 2022
  },
  {
    "author": "Yilmaz, Melih; Fondrie, William E.; Bittremieux, Wout; Oh, Sewoong; Noble, William S.",
    "booktitle": "Proceedings of the 39th International Conference on Machine Learning",
    "id": "Yilmaz-2022-Casanovo",
    "pages": "25514–25522",
    "title": "De novo mass spectrometry peptide sequencing with a transformer model",
    "type": "conference",
    "url": "https://proceedings.mlr.press/v162/yilmaz22a.html",
    "volume": "162",
    "year": 2022
  },
  {
    "abstract": "Machine learning (ML) profoundly improves the accuracy of the fast DU8+ hybrid density functional theory/parametric computations of nuclear magnetic resonance spectra, allowing for high throughput in silico validation and revision of complex alkaloids and other natural products. Of nearly 170 alkaloids surveyed, 35 structures are revised with the next-generation ML-augmented DU8 method, termed DU8ML.",
    "author": "Novitskiy, Ivan M.; Kutateladze, Andrei G.",
    "doi": "10.1021/acs.joc.2c00169",
    "id": "Novitskiy-2022-DU8ML",
    "journal": "The Journal of Organic Chemistry",
    "number": "7",
    "pages": "4818–4828",
    "title": "DU8ML: Machine Learning-Augmented Density Functional Theory Nuclear Magnetic Resonance Computations for High-Throughput In Silico Solution Structure Validation and Revision of Complex Alkaloids",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.joc.2c00169",
    "volume": "87",
    "year": 2022
  },
  {
    "abstract": "Whenever a new molecule is made, a chemist will justify the proposed structure by analysing the NMR spectra. The widely-used DP4 algorithm will choose the best match from a series of possibilities, but draws no conclusions from a single candidate structure. Here we present the DP5 probability, a step-change in the quantification of molecular uncertainty: given one structure and one 13C NMR spectra, DP5 gives the probability of the structure being correct. We show the DP5 probability can rapidly differentiate between structure proposals indistinguishable by NMR to an expert chemist. We also show in a number of challenging examples the DP5 probability may prevent incorrect structures being published and later reassigned. DP5 will prove extremely valuable in fields such as discovery-driven automated chemical synthesis and drug development. Alongside the DP4-AI package, DP5 can help guide synthetic chemists when resolving the most subtle structural uncertainty. The DP5 system is available at https://github.com/Goodman-lab/DP5.",
    "author": "Howarth, Alexander; Goodman, Jonathan M.",
    "doi": "10.1039/D1SC04406K",
    "id": "Howarth-2022-DP5Probability",
    "journal": "Chemical Science",
    "number": "12",
    "pages": "3507–3518",
    "publisher": "Royal Society of Chemistry (RSC)",
    "title": "The DP5 probability: quantification and visualisation of structural uncertainty in single molecules",
    "type": "article",
    "url": "https://doi.org/10.1039/D1SC04406K",
    "volume": "13",
    "year": 2022
  },
  {
    "abstract": "Nuclear magnetic resonance (NMR) spectroscopy is highly unbiased and reproducible, which provides us a powerful tool to analyze mixtures consisting of small molecules. However, the compound identification in NMR spectra of mixtures is highly challenging because of chemical shift variations of the same compound in different mixtures and peak overlapping among molecules. Here, we present a pseudo-Siamese convolutional neural network method (pSCNN) to identify compounds in mixtures for NMR spectroscopy. A data augmentation method was implemented for the superposition of several NMR spectra sampled from a spectral database with random noises. The augmented dataset was split and used to train, validate and test the pSCNN model. Two experimental NMR datasets (flavor mixtures and additional flavor mixture) were acquired to benchmark its performance in real applications. The results show that the proposed method can achieve good performances in the augmented test set (ACC = 99.80%, TPR = 99.70% and FPR = 0.10%), the flavor mixtures dataset (ACC = 97.62%, TPR = 96.44% and FPR = 2.29%) and the additional flavor mixture dataset (ACC = 91.67%, TPR = 100.00% and FPR = 10.53%). We have demonstrated that the translational invariance of convolutional neural networks can solve the chemical shift variation problem in NMR spectra. In summary, pSCNN is an off-the-shelf method to identify compounds in mixtures for NMR spectroscopy because of its accuracy in compound identification and robustness to chemical shift variation.",
    "author": "Wei, Weiwei; Liao, Yuxuan; Wang, Yufei; Wang, Shaoqi; Du, Wen; Lu, Hongmei; Kong, Bo; Yang, Huawu; Zhang, Zhimin",
    "doi": "10.3390/molecules27123653",
    "id": "Wei-2022-DeepNMR",
    "journal": "Molecules",
    "title": "Deep Learning-Based Method for Compound Identification in NMR Spectra of Mixtures",
    "type": "article",
    "url": "https://doi.org/10.3390/molecules27123653",
    "year": 2022
  },
  {
    "abstract": "1H NMR chemical shifts for 30 organic compounds (396 data points) were predicted using four NMR predictor software and compared with the experimental data from SDBS. The NMR predictors involved were MestReNova, ChemDraw, NMRShiftDB and ACD Workbook Suite. Root mean square deviation (RMSD) and mean absolute percentage error (MAPE) were calculated from the data obtained. One-way analysis of variance (ANOVA), Tukey’s honestly significant difference (HSD) and t-test were carried out to analyse the statistical significance of the differences between the predictors. The results from the statistical analysis were used to predict chemical shifts of three organic compounds.",
    "author": "Mah, Wern Huay; Ahmad Nazuan, Nadzran Hafiy; Yeap, Wei Sheung; Fakharudin, Farah Hasyeena; Faye, Ibrahima; Wilfred, Cecilia Devi",
    "doi": "10.5802/crchim.156",
    "id": "Mah-2022-NMRPredictor",
    "journal": "Comptes Rendus Chimie",
    "number": "G1",
    "pages": "83-95",
    "publisher": "Cellule MathDoc/Centre Mersenne",
    "title": "Deciding which is the best 1H NMR predictor for organic compounds using statistical tools",
    "type": "article",
    "url": "https://doi.org/10.5802/crchim.156",
    "volume": "25",
    "year": 2022
  },
  {
    "abstract": "Rapid progress in machine learning offers new opportunities for the automated analysis of multidimensional NMR spectra ranging from protein NMR to metabolomics applications. Most recently, it has been demonstrated how deep neural networks (DNN) designed for spectral peak picking are capable of deconvoluting highly crowded NMR spectra rivaling the facilities of human experts. Superior DNN-based peak picking is one of a series of critical steps during NMR spectral processing, analysis, and interpretation where machine learning is expected to have a major impact. In this perspective, we lay out some of the unique strengths as well as challenges of machine learning approaches in this new era of automated NMR spectral analysis. Such a discussion seems timely and should help define common goals for the NMR community, the sharing of software tools, standardization of protocols, and calibrate expectations. It will also help prepare for an NMR future where machine learning and artificial intelligence tools will be commonplace.",
    "author": "Li, Da-Wei; Hansen, Alexander L.; Bruschweiler-Li, Lei; Yuan, Chunhua; Brüschweiler, Rafael",
    "doi": "10.1007/s10858-022-00393-1",
    "id": "Li-2022-NMRPeakPicking",
    "journal": "Journal of Biomolecular NMR",
    "number": "3",
    "pages": "49–57",
    "publisher": "Springer",
    "title": "Fundamental and practical aspects of machine learning for the peak picking of biomolecular NMR spectra",
    "type": "article",
    "url": "https://link.springer.com/article/10.1007/s10858-022-00393-1",
    "volume": "76",
    "year": 2022
  },
  {
    "abstract": "Large pretrained models such as GPT-3 have had tremendous impact on modern natural language processing by leveraging self-supervised learning to learn salient representations that can be used to readily finetune on a wide variety of downstream tasks. We investigate the possibility of transferring such advances to molecular machine learning by building a chemical foundation model, ChemBERTa-2, using the language of SMILES. While labeled data for molecular prediction tasks is typically scarce, libraries of SMILES strings are readily available. In this work, we build upon ChemBERTa by optimizing the pretraining process. We compare multi-task and self-supervised pretraining by varying hyperparameters and pretraining dataset size, up to 77M compounds from PubChem. To our knowledge, the 77M set constitutes one of the largest datasets used for molecular pretraining to date. We find that with these pretraining improvements, we are competitive with existing state-of-the-art architectures on the MoleculeNet benchmark suite. We analyze the degree to which improvements in pretraining translate to improvement on downstream tasks.",
    "author": "Ahmad, Walid; Simon, Elana; Chithrananda, Seyone; Grand, Gabriel; Ramsundar, Bharath",
    "booktitle": "ELLIS Machine Learning for Molecule Discovery Workshop 2021",
    "doi": "10.48550/arXiv.2209.01712",
    "id": "Ahmad-2022-ChemBERTa2",
    "journal": "arXiv",
    "keywords": "ChemBERTa-2, chemical foundation models, SMILES, self-supervised learning, molecular pretraining",
    "note": "",
    "number": "2209.01712v1",
    "title": "ChemBERTa-2: Towards chemical foundation models",
    "type": "article",
    "url": "https://arxiv.org/abs/2209.01712",
    "year": 2022
  },
  {
    "abstract": "This study introduces a landscape theory for cell differentiation incorporating proliferation effects, modeling the developmental process as a stochastic dynamical system with a birth-death term. The research identifies two distinct energy landscapes, U and V, that collectively contribute to establishing non-equilibrium steady differentiation. The potential U represents the energy landscape leading to the steady distribution, with its metastable states corresponding to cell types, while V indicates the differentiation direction from pluripotent to differentiated cells. This interpretation differs from previous landscape theories that did not consider proliferation effects. The authors propose numerical methods and a mean-field approximation for constructing these landscapes, demonstrating the validity of energy landscape decomposition through applications to typical biological models.",
    "author": "Shi, Jifan; Aihara, Kazuyuki; Li, Tiejun; Chen, Luonan",
    "doi": "10.1093/nsr/nwac116",
    "id": "Shi-2022-ELDecomposition",
    "journal": "National Science Review",
    "number": "8",
    "pages": "nwac116",
    "title": "Energy landscape decomposition for cell differentiation with proliferation effect",
    "type": "article",
    "url": "https://doi.org/10.1093/nsr/nwac116",
    "volume": "9",
    "year": 2022
  },
  {
    "abstract": "This study proposes a novel framework for metric learning in optimal transport that leverages Riemannian geometry. The proposed method integrates Wasserstein distance to learn spatial structures and relationships more effectively, addressing computational inefficiencies in traditional aggregation approaches. An efficient Sinkhorn approximation and measure coreset are introduced to enhance scalability. Results demonstrate improvements over baseline methods in image retrieval and related applications.",
    "author": "Dou, Jason Xiaotian; Luo, Lei; Yang, Raymond Mingrui",
    "doi": "10.1609/aaai.v36i11.21604",
    "id": "Dou-2022-RiemannianMetricLearningOT",
    "journal": "Proceedings of the AAAI Conference on Artificial Intelligence",
    "keywords": "Optimal Transport; Riemannian Geometry; Metric Learning; Wasserstein Distance; Sinkhorn Approximation",
    "number": "11",
    "pages": "12935–12936",
    "title": "A Riemannian Approach to Ground Metric Learning for Optimal Transport",
    "type": "article",
    "url": "https://doi.org/10.1609/aaai.v36i11.21604",
    "volume": "36",
    "year": 2022
  },
  {
    "abstract": "Capturing visual similarity among images is the core of many computer vision and pattern recognition tasks. This problem can be formulated in a paradigm called metric learning. Most research in the area has focused on improving the loss functions and similarity measures. However, due to ignoring geometric structure, existing methods often lead to suboptimal results. Several recent methods leverage Wasserstein distance to characterize spatial geometry, but batch aggregation hinders its capability and increases computational complexity. This paper proposes a novel Deep Wasserstein Metric Learning framework, employing Wasserstein distance with ranking-based loss functions, such as contrastive loss and triplet loss, to directly compute distances between images, considering finer geometry. A new efficient algorithm using Sinkhorn approximation and Wasserstein measure coreset is introduced, demonstrating significant performance improvements over baselines.",
    "author": "Dou, Jason Xiaotian; Luo, Lei; Yang, Raymond Mingrui",
    "doi": "10.1609/aaai.v36i11.21604",
    "id": "Dou-2022-DeepMetricLearning",
    "journal": "Proceedings of the AAAI Conference on Artificial Intelligence",
    "keywords": "Optimal Transport; Metric Learning; Deep Learning; Sinkhorn Approximation; Wasserstein Distance",
    "number": "11",
    "pages": "12935–12936",
    "title": "An Optimal Transport Approach to Deep Metric Learning",
    "type": "article",
    "url": "https://doi.org/10.1609/aaai.v36i11.21604",
    "volume": "36",
    "year": 2022
  },
  {
    "abstract": "We present a framework for learning Riemannian metrics using optimal transport. The metric tensor is learned from cross-sectional samples of evolving probability measures on a common manifold. By parameterizing the metric tensor as a spatially varying matrix field, we enable nonlinear interpolation between probability measures and computation of geodesics on the manifold. The methodology is demonstrated on simulated data and real-world datasets to validate its flexibility and robustness.",
    "author": "Scarvelis, Christopher; Solomon, Justin",
    "doi": "10.1137/17M1148025",
    "id": "Scarvelis-2022-RiemannianMetricLearning",
    "journal": "SIAM Journal on Applied Algebra and Geometry",
    "keywords": "Riemannian Geometry; Metric Learning; Optimal Transport; Geodesics; Nonlinear Interpolation",
    "number": "4",
    "pages": "597–619",
    "title": "Riemannian Metric Learning via Optimal Transport",
    "type": "article",
    "url": "https://doi.org/10.1137/17M1148025",
    "volume": "2",
    "year": 2022
  },
  {
    "abstract": "High-throughput single-cell molecular profiling is revolutionizing biology and medicine by unveiling the diversity of cell types and states contributing to development and disease. The identification and characterization of cellular heterogeneity are typically achieved through unsupervised clustering, which crucially relies on a similarity metric. We propose the use of Optimal Transport (OT) as a cell–cell similarity metric for single-cell omics data. OT defines distances to compare high-dimensional data represented as probability distributions. To speed up computations and cope with the high dimensionality of single-cell data, we consider the entropic regularization of the classical OT distance. We then extensively benchmark OT against state-of-the-art metrics over 13 independent datasets, including simulated, scRNA-seq, scATAC-seq, and single-cell DNA methylation data. First, we test the ability of the metrics to detect the similarity between cells belonging to the same groups (e.g., cell types, cell lines of origin). Then, we apply unsupervised clustering and test the quality of the resulting clusters. OT is found to improve cell–cell similarity inference and cell clustering in all simulated and real scRNA-seq data, as well as in scATAC-seq and single-cell DNA methylation data.",
    "author": "Huizing, Geert-Jan; Peyré, Gabriel; Cantini, Laura",
    "doi": "10.1093/bioinformatics/btac084",
    "id": "Huizing-2022-OTSimilarity",
    "journal": "Bioinformatics",
    "keywords": "Single-cell omics; Cell-cell similarity; Optimal transport; scRNA-seq; scATAC-seq; DNA methylation",
    "number": "8",
    "pages": "2169–2177",
    "title": "Optimal Transport Improves Cell–Cell Similarity Inference in Single-Cell Omics Data",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btac084",
    "volume": "38",
    "year": 2022
  },
  {
    "abstract": "",
    "author": "Demetci, Pinar; Santorella, Rebecca; Sandstede, Björn; Noble, William Stafford; Singh, Ritambhara",
    "doi": "10.1089/cmb.2021.0446",
    "id": "Demetci-2022-SCOT",
    "journal": "Journal of Computational Biology",
    "keywords": "",
    "number": "1",
    "pages": "3–18",
    "title": "SCOT: Single-Cell Multi-Omics Alignment with Optimal Transport",
    "type": "article",
    "url": "https://doi.org/10.1089/cmb.2021.0446",
    "volume": "29",
    "year": 2022
  },
  {
    "abstract": "Building on the connection of optimal transport, gradient flows, and partial differential equations, this work learns an energy potential that explains the continuous differentiation of single cells over time.",
    "author": "Bunne, Charlotte; Papaxanthos, Lefteris; Krause, Andreas; Cuturi, Marco",
    "booktitle": "International Conference on Artificial Intelligence and Statistics (AISTATS)",
    "id": "Bunne-2022-PopulationDynamics",
    "keywords": "Distribution-to-Distribution Regression; Optimal Transport; Population Dynamics; Gradient Flows; Scalable Machine Learning; Single-Cell Dynamics; Energy-Based Models",
    "note": "Full details verified from authoritative sources.",
    "pages": "6511–6528",
    "publisher": "PMLR",
    "title": "Proximal optimal transport modeling of population dynamics",
    "type": "article",
    "url": "https://proceedings.mlr.press/v151/bunne22a.html",
    "year": 2022
  },
  {
    "abstract": "This paper introduces CondOT, a multi-task approach to estimate a family of optimal transport maps conditioned on a context variable, using several pairs of measures tagged with context labels. This method finds applications in predicting cell responses to treatments.",
    "author": "Bunne, Charlotte; Krause, Andreas; Cuturi, Marco",
    "id": "Bunne-2022-ConditionalMonge",
    "journal": "Advances in Neural Information Processing Systems",
    "keywords": "Distribution-to-Distribution Regression; Uncoupled Data Regression; Conditional Optimal Transport; Monge Maps; Scalable Machine Learning; Multi-Task Learning; Probabilistic Regression; Privacy-Preserving Regression; Single-Cell RNA Sequencing; Cell Response Prediction",
    "note": "Full details confirmed from NeurIPS and arXiv sources.",
    "pages": "6859–6872",
    "title": "Supervised training of conditional Monge maps",
    "type": "article",
    "url": "https://papers.nips.cc/paper_files/paper/2022/hash/2d880acd7b31e25d45097455c8e8257f-Abstract-Conference.html",
    "volume": "35",
    "year": 2022
  },
  {
    "abstract": "We present a framework for performing regression when both covariate and response are probability distributions on a compact interval. Our regression model is based on the theory of optimal transportation, and links the conditional Fréchet mean of the response to the covariate via an optimal transport map. We define a Fréchet-least-squares estimator of this regression map, and establish its consistency and rate of convergence to the true map, under both full and partial observations of the regression pairs. Computation of the estimator is shown to reduce to a standard convex optimization problem, and thus our regression model can be implemented with ease. We illustrate our methodology using real and simulated data.",
    "author": "Ghodrati, Laya; Panaretos, Victor M.",
    "doi": "10.1093/biomet/asac005",
    "id": "Ghodrati-2022-OptimalTransport",
    "journal": "Biometrika",
    "keywords": "Distribution-to-Distribution Regression; Optimal Transport; Scalable Machine Learning; Probabilistic Regression",
    "number": "4",
    "pages": "957-974",
    "title": "Distribution-on-distribution regression via optimal transport maps",
    "type": "article",
    "url": "https://doi.org/10.1093/biomet/asac005",
    "volume": "109",
    "year": 2022
  },
  {
    "abstract": "International travel has been recognized as a risk factor contributing to the spread of antimicrobial resistance (AMR). This tool is the first of its kind, developed to help healthcare professionals (HCPs) and international travelers evaluate risk factors, prevalence of carriage, and AMR screening recommendations. It includes a freely accessible, annually updated database and educational resources aimed at minimizing AMR acquisition and spread.",
    "author": "Arieti, Fabiana; Savoldi, Alessia; Rejendran, Nithya Babu; Sibani, Marcella; Tebon, Maela; Pezzani, Maria Diletta; Gorska, Anna; Wozniak, Teresa M; Tacconelli, Evelina",
    "doi": "10.1093/jtm/taac045",
    "id": "Arieti-2022-AMRTravelTool",
    "journal": "Journal of Travel Medicine",
    "number": "4",
    "pages": "taac045",
    "publisher": "Oxford University Press",
    "title": "The antimicrobial resistance travel tool, an interactive evidence-based educational tool to limit antimicrobial resistance spread",
    "type": "article",
    "url": "https://doi.org/10.1093/jtm/taac045",
    "volume": "29",
    "year": 2022
  },
  {
    "abstract": "Human cells produce thousands of lipids that change during cell differentiation and can vary across individual cells of the same type. However, we are only starting to characterize the function of these cell-to-cell differences in lipid composition. Here, we measured the lipidomes and transcriptomes of individual human dermal fibroblasts by coupling high-resolution mass spectrometry imaging with single-cell transcriptomics. We found that the cell-to-cell variations of specific lipid metabolic pathways contribute to the establishment of cell states involved in the organization of skin architecture. Sphingolipid composition is shown to define fibroblast subpopulations, with sphingolipid metabolic rewiring driving cell-state transitions. Therefore, cell-to-cell lipid heterogeneity affects the determination of cell states, adding a new regulatory component to the self-organization of multicellular systems.",
    "author": "Capolupo, Laura; Khven, Irina; Lederer, Alex R.; Mazzeo, Luigi; Glousker, Galina; Ho, Sylvia; Russo, Francesco; Montoya, Jonathan Paz; Bhandari, Dhaka R.; Bowman, Andrew P.; Ellis, Shane R.; Guiet, Romain; Burri, Olivier; Detzner, Johanna; Muthing, Johannes; Homicsko, Krisztian; Kuonen, François; Gilliet, Michel; Spengler, Bernhard; Heeren, Ron M. A.; Dotto, G. Paolo; La Manno, Gioele; D’Angelo, Giovanni",
    "doi": "10.1126/science.abh1623",
    "id": "Capolupo-2022-Sphingolipids",
    "journal": "Science",
    "number": "6590",
    "pages": "Article abh1623",
    "publisher": "American Association for the Advancement of Science",
    "title": "Sphingolipids control dermal fibroblast heterogeneity",
    "type": "article",
    "url": "https://doi.org/10.1126/science.abh1623",
    "volume": "376",
    "year": 2022
  },
  {
    "abstract": "The embryonic neural tube is the origin of the entire adult nervous system, and disturbances in its development cause life-threatening birth defects. However, the study of mammalian neural tube development is limited by the lack of physiologically realistic three-dimensional (3D) in vitro models. Here, we report a self-organizing 3D neural tube organoid model derived from single mouse embryonic stem cells that exhibits an in vivo-like tissue architecture, cell type composition and anterior-posterior (AP) patterning. Moreover, maturation of the neural tube organoids showed the emergence of multipotent neural crest cells and mature neurons. Single-cell transcriptome analyses revealed the sequence of transcriptional events in the emergence of neural crest cells and neural differentiation. Thanks to the accessibility of this model, phagocytosis of migrating neural crest cells could be observed in real time for the first time in a mammalian model. We thus introduce a tractable in vitro model to study some of the key morphogenetic and cell type derivation events during early neural development.",
    "author": "Park, JiSoo; Hsiung, Hao-An; Khven, Irina; La Manno, Gioele; Lutolf, Matthias P.",
    "doi": "10.1242/dev.201052",
    "id": "Park-2022-NeuralTubeOrganoids",
    "journal": "Development",
    "number": "20",
    "pages": "Article dev201052",
    "publisher": "The Company of Biologists",
    "title": "Self-organizing in vitro mouse neural tube organoids mimic embryonic development",
    "type": "article",
    "url": "https://doi.org/10.1242/dev.201052",
    "volume": "149",
    "year": 2022
  },
  {
    "abstract": "Single-cell RNA Velocity has dramatically advanced our ability to model cellular differentiation and cell fate decisions. However, current preprocessing choices and model assumptions often lead to errors in assigning developmental trajectories. Here, we develop Pyro-Velocity, a Bayesian, generative, and multivariate RNA Velocity model to estimate the uncertainty of cell future states. This approach models raw sequencing counts with the synchronized cell time across all expressed genes to provide quantifiable and improved information on cell fate choices and developmental trajectory dynamics.",
    "author": "Qin, Qian; Bingham, Eli; La Manno, Gioele; Langenau, David M.; Pinello, Luca",
    "doi": "10.1101/2022.09.12.507691",
    "id": "Qin-2022-PyroVelocity",
    "journal": "bioRxiv",
    "publisher": "Cold Spring Harbor Laboratory",
    "title": "Pyro-Velocity: Probabilistic RNA Velocity inference from single-cell data",
    "type": "article",
    "url": "https://doi.org/10.1101/2022.09.12.507691",
    "year": 2022
  },
  {
    "abstract": "This study profiles the molecular differentiation of retinal pigment epithelial cells derived from stem cells, establishing protocols for clinical translation to treat age-related macular degeneration.",
    "author": "Petrus-Reurer, Sandra; Lederer, Alex R.; Baqué-Vidal, Laura; Douagi, Iyadh; Pannagel, Belinda; Khven, Irina; Aronsson, Monica; Bartuma, Hammurabi; Wagner, Magdalena; Wrona, Andreas; Efstathopoulos, Paschalis; Jaberi, Elham; Willenbrock, Hanni; Shimizu, Yutaka; Villaescusa, J. Carlos; André, Helder; Sundstrøm, Erik; Bhaduri, Aparna; Kriegstein, Arnold; Kvanta, Anders; La Manno, Gioele; Lanner, Fredrik",
    "doi": "10.1016/j.stemcr.2022.05.005",
    "id": "PetrusReurer-2022-RPEProfiling",
    "journal": "Stem Cell Reports",
    "number": "6",
    "pages": "1458-1475",
    "publisher": "Elsevier BV",
    "title": "Molecular profiling of stem cell-derived retinal pigment epithelial cell differentiation established for clinical translation",
    "type": "article",
    "url": "https://doi.org/10.1016/j.stemcr.2022.05.005",
    "volume": "17",
    "year": 2022
  },
  {
    "abstract": "This study presents AMPDeep, a model utilizing transfer learning to predict the hemolytic activity of antimicrobial peptides. By fine-tuning a large transformer-based protein language model on a small peptide dataset, AMPDeep achieves state-of-the-art performance across three benchmarks, addressing challenges associated with limited data in deep learning applications.",
    "author": "Salem, Milad; Arshadi, Arash Keshavarzi; Yuan, Jiann Shiun",
    "doi": "10.1186/s12859-022-04952-z",
    "id": "Salem-2022-AMPDeep",
    "journal": "BMC Bioinformatics",
    "keywords": "AMPDeep, antimicrobial peptides, hemolytic activity, transfer learning, protein language models",
    "number": "389",
    "title": "AMPDeep: Hemolytic Activity Prediction of Antimicrobial Peptides Using Transfer Learning",
    "type": "article",
    "url": "https://doi.org/10.1186/s12859-022-04952-z",
    "volume": "23",
    "year": 2022
  },
  {
    "abstract": "ColabFold accelerates protein structure prediction using MMseqs2 integrated with AlphaFold2 or RoseTTAFold. It is a free, open-source tool optimized for fast predictions, capable of folding thousands of proteins daily on standard hardware.",
    "author": "Mirdita, Milot; Schütze, Konstantin; Moriwaki, Yoshitaka; Heo, Lim; Ovchinnikov, Sergey; Steinegger, Martin",
    "doi": "10.1038/s41592-022-01488-1",
    "id": "Mirdita-2022-ColabFold",
    "journal": "Nature Methods",
    "keywords": "protein folding, ColabFold, AlphaFold, RoseTTAFold, MMseqs2",
    "title": "ColabFold: Making protein folding accessible to all",
    "type": "article",
    "url": "https://doi.org/10.1038/s41592-022-01488-1",
    "year": 2022
  },
  {
    "abstract": "This study proposes embedding-based alignments (EBA) to leverage protein language models for detecting structural similarities in the twilight zone. The method demonstrates strong performance compared to classical sequence alignment tools.",
    "author": "Pantolini, Lorenzo; Studer, Gabriel; Pereira, Joana; Durairaj, Janani; Schwede, Torsten",
    "doi": "10.1093/bioinformatics/btad786",
    "id": "Pantolini-2022-EBA",
    "journal": "Bioinformatics",
    "keywords": "protein language models, embedding-based alignment, structural similarities, twilight-zone, bioinformatics",
    "title": "Embedding-Based Alignment: Combining Protein Language Models and Alignment Approaches to Detect Structural Similarities in the Twilight-Zone",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btad786",
    "year": 2022
  },
  {
    "abstract": "Focuses on the use of pre-trained language models for generating embeddings and training a convolutional neural network to predict antimicrobial peptides (AMPs). The results demonstrated predictive accuracies of 93.33% and 88.26% on validation datasets, surpassing previous methods.",
    "author": "Dee, William",
    "doi": "10.1093/bioadv/vbac021",
    "id": "Dee-2022-LMPred",
    "journal": "Bioinformatics Advances",
    "keywords": "antimicrobial peptides, protein language models, deep learning, peptide prediction, LMPred",
    "number": "1",
    "title": "LMPred: Predicting Antimicrobial Peptides Using Pre-Trained Language Models and Deep Learning",
    "type": "article",
    "url": "https://doi.org/10.1093/bioadv/vbac021",
    "volume": "2",
    "year": 2022
  },
  {
    "abstract": "This study introduces ProteinBERT, a deep language model specifically designed for proteins. The pretraining scheme combines language modeling with Gene Ontology (GO) annotation prediction. The model's architecture includes both local and global representations, allowing end-to-end processing of various inputs and outputs. ProteinBERT achieves near state-of-the-art performance on multiple benchmarks covering diverse protein properties, including structure, post-translational modifications, and biophysical attributes, despite being smaller and faster than competing deep-learning methods.",
    "author": "Brandes, Nadav; Ofer, Dan; Peleg, Yam; Rappoport, Nadav; Linial, Michal",
    "doi": "10.1093/bioinformatics/btac020",
    "id": "Brandes-2022-ProteinBERT",
    "journal": "Bioinformatics",
    "keywords": "ProteinBERT, protein sequence, protein function, Gene Ontology, deep learning",
    "number": "8",
    "pages": "2102-2110",
    "title": "ProteinBERT: a universal deep-learning model of protein sequence and function",
    "type": "article",
    "url": "https://academic.oup.com/bioinformatics/article/38/8/2102/6502274",
    "volume": "38",
    "year": 2022
  },
  {
    "abstract": "This study demonstrates that language models, trained solely on natural protein sequences, can generate de novo proteins. The authors focus on two design tasks: fixed backbone design, where the structure is specified, and unconstrained generation, where the structure is sampled from the model. Remarkably, despite being trained only on sequences, the models are capable of designing structures. A total of 228 generated proteins were evaluated experimentally, with a high success rate of 67% in producing soluble and monomeric species.",
    "author": "Verkuil, Robert; Kabeli, Ori; Du, Yilun; Wicky, Basile I. M.; Milles, Lukas F.; Dauparas, Justas; Baker, David; Ovchinnikov, Sergey; Sercu, Tom; Rives, Alexander",
    "doi": "10.1101/2022.12.21.521521",
    "id": "Verkuil-2022-LanguageModelsBeyond",
    "journal": "bioRxiv",
    "keywords": "protein language models, de novo protein design, structural biology, fixed backbone design",
    "title": "Language models generalize beyond natural proteins",
    "type": "article",
    "url": "https://www.biorxiv.org/content/10.1101/2022.12.21.521521v1",
    "year": 2022
  },
  {
    "abstract": "This paper introduces OmegaFold, a computational method that predicts high-resolution protein structures directly from single primary sequences without relying on multiple sequence alignments (MSAs). OmegaFold combines a protein language model with a geometry-inspired transformer model, achieving prediction accuracy comparable to AlphaFold2 on recently released structures.",
    "author": "Wu, Ruidong; Ding, Fan; Wang, Rui; Shen, Rui; Zhang, Xiwen; Luo, Shitong; Su, Chenpeng; Wu, Zuofan; Xie, Qi; Berger, Bonnie; Ma, Jianzhu; Peng, Jian",
    "doi": "10.1101/2022.07.21.500999",
    "id": "Wu-2022-OmegaFold",
    "journal": "bioRxiv",
    "keywords": "OmegaFold, protein structure prediction, de novo, transformer model, MSA-free",
    "title": "High-resolution de novo structure prediction from primary sequence",
    "type": "article",
    "year": 2022
  },
  {
    "abstract": "This paper introduces ProtGPT2, a deep unsupervised language model trained on protein sequences. The model generates de novo protein sequences that exhibit properties similar to natural proteins, including appropriate amino acid propensities and structural features.",
    "author": "Ferruz, Noelia; Schmidt, Steffen; Höcker, Birte",
    "doi": "10.1038/s41467-022-32007-7",
    "id": "Ferruz-2022-ProtGPT2",
    "journal": "Nature Communications",
    "keywords": "ProtGPT2, protein design, deep learning, de novo proteins, unsupervised learning",
    "number": "4348",
    "pages": "1-12",
    "title": "ProtGPT2 is a deep unsupervised language model for protein design",
    "type": "article",
    "volume": "13",
    "year": 2022
  },
  {
    "abstract": "ProtTrans presents a suite of protein language models trained on large protein sequence datasets, including Transformer-XL, XLNet, BERT, Albert, Electra, and T5. These models capture biophysical features of proteins and excel in tasks like secondary structure prediction and subcellular localization. Notably, ProtTrans achieves state-of-the-art results without relying on evolutionary data or costly database searches. These findings demonstrate that protein language models can effectively learn patterns in protein sequences. Models are available at https://github.com/agemagician/ProtTrans.",
    "author": "Elnaggar, Ahmed; Heinzinger, Michael; Dallago, Christian; Rehawi, Ghalia; Wang, Yu; Jones, Llion; Gibbs, Tom; Feher, Tamas; Angerer, Christoph; Steinegger, Martin; Bhowmik, Debsindhu; Rost, Burkhard",
    "doi": "10.1109/TPAMI.2021.3095381",
    "id": "Elnaggar-2022-ProtTrans",
    "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
    "keywords": "ProtTrans, protein language models, Transformer-XL, XLNet, BERT, Albert, Electra, T5, secondary structure prediction, subcellular localization, biophysical properties",
    "number": "10",
    "pages": "7112-7127",
    "title": "ProtTrans: Toward understanding the language of life through self-supervised learning",
    "type": "article",
    "volume": "44",
    "year": 2022
  },
  {
    "abstract": "This study evaluates chemical language models based on recurrent neural networks for low-data chemical space exploration, showing that robust models can be trained with limited examples through strategies like SMILES augmentation, and highlighting robust metrics for generative model evaluation.",
    "author": "Skinnider, Michael A.; Stacey, R. Greg; Wishart, David S.; Foster, Leonard J.",
    "doi": "10.1038/s42256-021-00368-1",
    "id": "Skinnider-2021-ChemLang",
    "journal": "Nature Machine Intelligence",
    "pages": "759–770",
    "title": "Chemical language models enable navigation in sparsely populated chemical space",
    "type": "article",
    "url": "https://www.nature.com/articles/s42256-021-00368-1",
    "volume": "3",
    "year": 2021
  },
  {
    "abstract": "DU8+ NMR computations reveal frequent misassignments in natural products featuring carboxylic anhydrides, particularly due to neighboring hydroxy groups forming lactones. Ten structural revisions are presented.",
    "author": "Novitskiy, Ivan M.; Kutateladze, Andrei G.",
    "doi": "10.1021/acs.joc.1c02291",
    "id": "Novitskiy-2021-DU8Plus",
    "journal": "The Journal of Organic Chemistry",
    "number": "23",
    "pages": "17511–17515",
    "title": "DU8+ computations reveal a common challenge in the structure assignment of natural products containing a carboxylic anhydride moiety",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.joc.1c02291",
    "volume": "86",
    "year": 2021
  },
  {
    "abstract": "CReSS is a deep contrastive learning framework for cross-modal retrieval between 13C NMR spectra and molecular structures. It enables structure search using NMR spectra without requiring large paired libraries. Evaluated on a dataset of 41,494 spectra and 10.4M structures, it achieved up to 98.39% recall@10 with molecular weight filtering.",
    "author": "Yang, Zhuo; Song, Jianfei; Yang, Minjian; Yao, Lin; Zhang, Jiahua; Shi, Hui; Ji, Xiangyang; Deng, Yafeng; Wang, Xiaojian",
    "doi": "10.1021/acs.analchem.1c04307",
    "id": "Yang-2021-CrossModal",
    "journal": "Analytical Chemistry",
    "number": "50",
    "pages": "16947–16955",
    "title": "Cross-modal retrieval between {¹³C} NMR spectra and structures for compound identification using deep contrastive learning",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.1c04307",
    "volume": "93",
    "year": 2021
  },
  {
    "abstract": "Deep learning has revolutionized many tasks in image, speech, and language processing. However, many applications require handling data from non-Euclidean domains represented as graphs. Graph neural networks (GNNs) address this challenge by extending deep learning techniques to graphs. This survey provides a structured overview of GNNs, categorizing them into recurrent GNNs, convolutional GNNs, graph autoencoders, and spatial-temporal GNNs. It also reviews GNN applications, open-source tools, datasets, and evaluation methods, and suggests future research directions.",
    "author": "Wu, Zonghan; Pan, Shirui; Chen, Fengwen; Long, Guodong; Zhang, Chengqi; Yu, Philip S.",
    "doi": "10.1109/TNNLS.2020.2978386",
    "id": "Wu-2021-GNNSurvey",
    "journal": "IEEE transactions on neural networks and learning systems",
    "keywords": "Deep learning; graph neural networks; GNN; graph representation learning; network embedding; graph convolutional networks; graph autoencoder",
    "number": "1",
    "pages": "4-24",
    "title": "A comprehensive survey on graph neural networks",
    "type": "article",
    "url": "https://doi.org/10.1109/TNNLS.2020.2978386",
    "volume": "32",
    "year": 2021
  },
  {
    "abstract": "We study statistical inference and distributionally robust solution methods for stochastic optimization problems, focusing on confidence intervals for optimal values and solutions that achieve exact coverage asymptotically. We develop a generalized empirical likelihood framework—based on distributional uncertainty sets constructed from nonparametric f-divergence balls—for Hadamard differentiable functionals, and in particular, stochastic optimization problems. As consequences of this theory, we provide a principled method for choosing the size of distributional uncertainty regions to provide one- and two-sided confidence intervals that achieve exact coverage. We also give an asymptotic expansion for our distributionally robust formulation, showing how robustification regularizes problems by their variance. Finally, we show that optimizers of the distributionally robust formulations we study enjoy (essentially) the same consistency properties as those in classical sample average approximations. Our general approach applies to quickly mixing stationary sequences, including geometrically ergodic Harris recurrent Markov chains.",
    "author": "Duchi, John C.; Glynn, Peter W.; Namkoong, Hongseok",
    "doi": "10.1287/moor.2020.1085",
    "id": "Duchi-2021-GELDRO",
    "journal": "Mathematics of Operations Research",
    "pages": "946-969",
    "title": "Statistics of robust optimization: A generalized empirical likelihood approach",
    "type": "article",
    "url": "https://doi.org/10.1287/moor.2020.1085",
    "volume": "46",
    "year": 2021
  },
  {
    "abstract": "Proposes a Communicative Message Passing Transformer (CoMPT) that integrates node–edge message interactions using a diffusion mechanism within a Transformer architecture. Unlike fully connected transformer-style GNNs, CoMPT preserves molecular graph structure to prevent message oversaturation. It achieves ~4% average improvement over state-of-the-art models on seven graph-level and two node-level molecular property benchmarks.",
    "author": "Chen, Jianwen; Zheng, Shuangjia; Song, Ying; Rao, Jiahua; Yang, Yuedong",
    "doi": "10.24963/ijcai.2021/309",
    "id": "Chen-2021-CoMPT",
    "journal": "Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI-21)",
    "pages": "2242–2248",
    "title": "Learning attributed graph representations with communicative message passing transformer",
    "type": "article",
    "url": "https://www.ijcai.org/proceedings/2021/0309.pdf",
    "year": 2021
  },
  {
    "abstract": "UniT is a Unified Transformer architecture designed for simultaneous learning across multiple tasks and modalities, using a shared encoder-decoder structure with task-specific output heads. It handles seven tasks over eight datasets using shared parameters, achieving strong performance with high efficiency. Code available at https://mmf.sh.",
    "author": "Hu, Ronghang; Singh, Amanpreet",
    "doi": "10.1109/ICCV48922.2021.00147",
    "id": "Hu-2021-UniT",
    "journal": "2021 IEEE/CVF International Conference on Computer Vision (ICCV)",
    "pages": "1419-1429",
    "title": "UniT: Multimodal multitask learning with a unified transformer",
    "type": "article",
    "url": "https://doi.org/10.1109/ICCV48922.2021.00147",
    "year": 2021
  },
  {
    "abstract": "Molecular property prediction is crucial for drug discovery. This paper introduces Mol-BERT, an end-to-end deep learning framework that uses a pretrained BERT model to generate molecular substructure embeddings. The model is trained on four million unlabeled drug SMILES (ZINC 15 & ChEMBL 27) and fine-tuned on Tox21, SIDER, and ClinTox datasets. Results show a 2% ROC-AUC score improvement over existing sequence-based methods.",
    "author": "Li, Juncai; Jiang, Xiaofei",
    "doi": "10.1155/2021/7181815",
    "id": "Li-2021-MolBERT",
    "journal": "Wireless Communications and Mobile Computing",
    "keywords": "Molecular Representation Learning, BERT, SMILES, Drug Discovery, Deep Learning",
    "pages": "Article ID 7181815, 7 pages",
    "title": "Mol-BERT: An Effective Molecular Representation with BERT for Molecular Property Prediction",
    "type": "article",
    "url": "https://doi.org/10.1155/2021/7181815",
    "volume": "2021",
    "year": 2021
  },
  {
    "abstract": "Fourier transform infrared spectroscopy (FTIR) is a ubiquitous spectroscopic technique. Spectral interpretation is a time-consuming process, but it yields important information about functional groups present in compounds and in complex substances. We develop a generalizable model via a machine learning (ML) algorithm using convolutional neural networks (CNNs) to identify the presence of functional groups in gas-phase FTIR spectra. The ML models reduce the amount of time required to analyze functional groups and facilitate interpretation of FTIR spectra. Through web scraping, we acquire intensity-frequency data from 8728 gas-phase organic molecules within the NIST spectral database and transform the data into spectral images. We successfully train models for 15 of the most common organic functional groups, which we then determine via identification from previously untrained spectra. These models serve to expand the application of FTIR measurements for facile analysis of organic samples. Our approach was done such that we have broad functional group models that infer in tandem to provide full interpretation of a spectrum. We present the first implementation of ML using image-based CNNs for predicting functional groups from a spectroscopic method.",
    "author": "Enders, Abigail A.; North, Nicole M.; Fensore, Chase M.; Velez-Alvarez, Juan; Allen, Heather C.",
    "doi": "10.1021/acs.analchem.1c00867",
    "id": "Enders-2021-FTIRML",
    "journal": "Analytical Chemistry",
    "number": "28",
    "pages": "9711–9718",
    "title": "Functional Group Identification for FTIR Spectra Using Image-Based Machine Learning Models",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.analchem.1c00867",
    "volume": "93",
    "year": 2021
  },
  {
    "abstract": "Metabolomics using nontargeted tandem mass spectrometry can detect thousands of molecules in a biological sample. However, structural molecule annotation is limited to structures present in libraries or databases, restricting analysis and interpretation of experimental data. Here we describe CANOPUS (class assignment and ontology prediction using mass spectrometry), a computational tool for systematic compound class annotation. CANOPUS uses a deep neural network to predict 2,497 compound classes from fragmentation spectra, including all biologically relevant classes. CANOPUS explicitly targets compounds for which neither spectral nor structural reference data are available and predicts classes lacking tandem mass spectrometry training data. In evaluation using reference data, CANOPUS reached very high prediction performance (average accuracy of 99.7% in cross-validation) and outperformed four baseline methods. We demonstrate the broad utility of CANOPUS by investigating the effect of microbial colonization in the mouse digestive system, through analysis of the chemodiversity of different Euphorbia plants and regarding the discovery of a marine natural product, revealing biological insights at the compound class level.",
    "author": "Dührkop, Kai; Nothias, Louis-Félix; Fleischauer, Markus; Reher, Raphael; Ludwig, Marcus; Hoffmann, Martin A.; Petras, Daniel; Gerwick, William H.; Rousu, Juho; Dorrestein, Pieter C.; Böcker, Sebastian",
    "doi": "10.1038/s41587-020-0740-8",
    "id": "Dührkop-2021-MetaboliteClassification",
    "journal": "Nature Biotechnology",
    "number": "4",
    "pages": "462–471",
    "title": "Systematic classification of unknown metabolites using high-resolution fragmentation mass spectra",
    "type": "article",
    "url": "https://www.nature.com/articles/s41587-020-0740-8",
    "volume": "39",
    "year": 2021
  },
  {
    "abstract": "A machine learning model and graph generator were able to accurately predict for the presence of nearly 1000 substructures and the connectivity of small organic molecules from experimental 1D NMR data.",
    "author": "Huang, Zhaorui; Chen, Michael S.; Woroch, Cristian P.; Markland, Thomas E.; Kanan, Matthew W.",
    "doi": "10.1039/d1sc04105c",
    "id": "Huang-2021-NMRFramework",
    "journal": "Chemical Science",
    "number": "46",
    "pages": "15329-15338",
    "publisher": "Royal Society of Chemistry",
    "title": "A framework for automated structure elucidation from routine NMR spectra",
    "type": "article",
    "url": "https://doi.org/10.1039/d1sc04105c",
    "volume": "12",
    "year": 2021
  },
  {
    "abstract": "Phase and baseline corrections are important processing steps in the analysis of NMR spectra, and as a consequence, many different approaches have been developed in the past to carry out these tasks automatically. While these methods perform generally well, the performance of many of them suffers when applied to spectra with high signal densities such as e.g. proton spectra. Here, we introduce a deep learning-based method for phase and baseline correction of 1D 1H NMR spectra. We show that this method represents a major step forward compared to previously available TopSpin solutions. The algorithm provides consistently better correction of phase and baseline both for low- and high-field spectra, even reaching human-level quality results in phase correction accuracy. The new method is available in TopSpin as command apbk starting from version 4.1.3, and it marks a further step towards the fully automated analysis of NMR spectra.",
    "author": "Bruderer, Simon; Paruzzo, Francesco; Bolliger, Christoph",
    "id": "Bruderer-2021-PhaseBaseline",
    "journal": "Bruker Whitepaper",
    "note": "Whitepaper available from Bruker.",
    "title": "Deep Learning-Based Phase and Baseline Correction of 1D 1H NMR Spectra",
    "type": "article",
    "url": "https://www.bruker.com/content/dam/bruker/int/en/resources/bbio/magnetic-resonance/application-notes/T186209_Bruker%20Whitepaper%20Deep%20Learning%20in%20NMR.pdf",
    "year": 2021
  },
  {
    "abstract": "Optimal transport (OT) distances between probability distributions are parameterized by the ground metric they use between observations. Their relevance for real-life applications strongly hinges on whether that ground metric parameter is suitably chosen. The challenge of selecting it adaptively and algorithmically from prior knowledge, the so-called ground metric learning (GML) problem, has therefore appeared in various settings. In this paper, we consider the GML problem when the learned metric is constrained to be a geodesic distance on a graph that supports the measures of interest. This imposes a rich structure for candidate metrics, but also enables far more efficient learning procedures when compared to a direct optimization over the space of all metric matrices. We use this setting to tackle an inverse problem stemming from the observation of a density evolving with time; we seek a graph ground metric such that the OT interpolation between the starting and ending densities that result from that ground metric agrees with the observed evolution. This OT dynamic framework is relevant to model natural phenomena exhibiting displacements of mass, such as the evolution of the color palette induced by the modification of lighting and materials.",
    "author": "Heitz, Matthieu; Bonneel, Nicolas; Coeurjolly, David; Cuturi, Marco; Peyré, Gabriel",
    "doi": "10.1007/s10851-020-00996-z",
    "id": "Heitz-2021-GroundMetricGraphs",
    "journal": "Journal of Mathematical Imaging and Vision",
    "number": "1",
    "pages": "89–107",
    "title": "Ground metric learning on graphs",
    "type": "article",
    "url": "https://doi.org/10.1007/s10851-020-00996-z",
    "volume": "63",
    "year": 2021
  },
  {
    "abstract": "This paper examines energy-like landscapes for complex living systems, utilizing cellular automata governed by verbal rules to address the parameter problem in biological modeling. It introduces predictive landscapes described by Lyapunov functions as a promising strategy for representing complex cellular dynamics.",
    "author": "Koopmans, Lars; Youk, Hyun",
    "doi": "10.1007/s10867-021-09592-7",
    "id": "Koopmans-2021-PredictiveLandscapes",
    "journal": "Journal of Biological Physics",
    "number": "4",
    "pages": "355-369",
    "title": "Predictive landscapes hidden beneath biological cellular automata",
    "type": "article",
    "url": "https://doi.org/10.1007/s10867-021-09592-7",
    "volume": "47",
    "year": 2021
  },
  {
    "abstract": "This paper explores the connections between stochastic control theory and optimal transport via the Schrödinger bridge problem, highlighting its applications to both Richard Sinkhorn's algorithm and Gaspard Monge's formulation of transport. The authors delve into mathematical foundations and discuss computational implementations relevant to applied mathematics and engineering.",
    "author": "Chen, Yongxin; Georgiou, Tryphon T.; Pavon, Michele",
    "doi": "10.1137/20m1339982",
    "id": "Chen-2021-SinkhornBridge",
    "journal": "SIAM Review",
    "keywords": "Stochastic Control, Schrödinger Bridge, Optimal Transport, Sinkhorn Algorithm, Monge Formulation",
    "pages": "249-313",
    "title": "Stochastic Control Liaisons: Richard Sinkhorn Meets Gaspard Monge on a Schrödinger Bridge",
    "type": "article",
    "url": "https://doi.org/10.1137/20m1339982",
    "volume": "63",
    "year": 2021
  },
  {
    "abstract": "Explores single-cell similarity metrics using optimal transport, introducing the 'gene mover’s distance' to quantify similarities between single-cell profiles. Demonstrated on large-scale datasets, this approach effectively identifies gene expression variations and relationships.",
    "author": "Bellazzi, Riccardo; Codegoni, Andrea; Gualandi, Stefano; Nicora, Giovanna; Vercesi, Eleonora",
    "doi": "10.48550/arXiv.2102.01218",
    "id": "Bellazzi-2021-GeneMoversDistance",
    "journal": "arXiv",
    "keywords": "Single-cell analysis; Gene expression; Optimal Transport; Gene mover’s distance",
    "title": "The gene mover’s distance: Single-cell similarity via optimal transport",
    "type": "article",
    "url": "https://arxiv.org/abs/2102.01218",
    "year": 2021
  },
  {
    "abstract": "Single-cell RNA-Seq (scRNA-seq) is invaluable for studying biological systems. Dimensionality reduction is crucial in interpreting the relation between cells in scRNA-seq data. Current methods are confounded by technical and biological variability, resulting in 'crowding' or inadequate temporal relationship capture. ScPhere, a scalable deep generative model, embeds cells into low-dimensional hyperspherical or hyperbolic spaces to accurately represent scRNA-seq data, addressing batch factors, cell crowding, and uncovering temporal trajectories. Demonstrated on nine datasets, ScPhere facilitates data interpretation by generating batch-invariant embeddings, mapping new individuals, inferring spatial positions, and highlighting cellular relations.",
    "author": "Ding, Jiarui; Regev, Aviv",
    "doi": "10.1038/s41467-021-22851-4",
    "id": "Ding-2021-ScRNAHypersphere",
    "journal": "Nature Communications",
    "keywords": "single-cell RNA-seq, hyperspheres, hyperbolic spaces, dimensionality reduction, deep generative model",
    "number": "1",
    "pages": "2554",
    "title": "Deep generative model embedding of single-cell RNA-Seq profiles on hyperspheres and hyperbolic spaces",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-021-22851-4",
    "volume": "12",
    "year": 2021
  },
  {
    "abstract": "Understanding how cells change their identity and behaviour in living systems is an important question in many fields of biology. The problem of inferring cell trajectories from single-cell measurements has been a major topic in the single-cell analysis community, with different methods developed for equilibrium and non-equilibrium systems (e.g., haematopoiesis vs. embryonic development). We show that optimal transport analysis, a technique originally designed for analysing time-courses, may also be applied to infer cellular trajectories from a single snapshot of a population in equilibrium. Therefore, optimal transport provides a unified approach to inferring trajectories, applicable to both stationary and non-stationary systems. Our method, StationaryOT, is mathematically motivated in a natural way from the hypothesis of a Waddington’s epigenetic landscape. We implement StationaryOT as a software package and demonstrate its efficacy in applications to simulated data as well as single-cell data from Arabidopsis thaliana root development.",
    "author": "Zhang, Stephen; Afanassiev, Anton; Greenstreet, Laura; Matsumoto, Tetsuya; Schiebinger, Geoffrey",
    "doi": "10.1371/journal.pcbi.1009466",
    "id": "Zhang-2021-OptimalTransportSteadyState",
    "journal": "PLOS Computational Biology",
    "keywords": "Optimal transport; Single-cell analysis; Cell trajectories; Stationary systems; Waddington’s landscape; Arabidopsis thaliana; Steady-state dynamics",
    "number": "12",
    "pages": "e1009466",
    "title": "Optimal Transport Analysis Reveals Trajectories in Steady-State Systems",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pcbi.1009466",
    "volume": "17",
    "year": 2021
  },
  {
    "abstract": "The study reviews the acquisition, carriage, and dissemination of AMR bacteria by international travelers. It highlights that approximately 30% of travelers return with acquired AMR bacteria. Key risk factors include the travel destination, use of antimicrobials, and episodes of travelers' diarrhea (TD). Genomic analyses identify the AMR genes acquired and spread by travelers. Emphasis is placed on genomic surveillance to better understand and mitigate the role of international travel in the global spread of AMR.",
    "author": "Sridhar, Sushmita; Turbett, Sarah E.; Harris, Jason B.; LaRocque, Regina C.",
    "doi": "10.1097/qco.0000000000000751",
    "id": "Sridhar-2021-AMRTravelers",
    "journal": "Current Opinion in Infectious Diseases",
    "number": "5",
    "pages": "423-431",
    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
    "title": "Antimicrobial-resistant bacteria in international travelers",
    "type": "article",
    "url": "https://journals.lww.com/10.1097/QCO.0000000000000751",
    "volume": "34",
    "year": 2021
  },
  {
    "abstract": "This article introduces resistancebank.org, an online repository that aggregates data from 1,285 surveys of antimicrobial resistance in animals across LMICs. The platform provides country-level reports and geospatial maps of AMR at high resolution, allowing for enhanced policy planning and intervention prioritization.",
    "author": "Criscuolo, Nicola G.; Pires, João; Zhao, Cheng; Van Boeckel, Thomas P.",
    "doi": "10.1038/s41597-021-00978-9",
    "id": "Criscuolo-2021-ResistanceBank",
    "journal": "Scientific Data",
    "number": "1",
    "pages": "Article 189",
    "publisher": "Springer Science and Business Media LLC",
    "title": "resistancebank.org, an open-access repository for surveys of antimicrobial resistance in animals",
    "type": "article",
    "url": "https://doi.org/10.1038/s41597-021-00978-9",
    "volume": "8",
    "year": 2021
  },
  {
    "abstract": "This study introduces HMD-AMP, a hierarchical multi-label deep forest framework powered by protein language models to comprehensively annotate antimicrobial peptides (AMPs). It predicts AMP targets across eleven classes and outperforms state-of-the-art methods, particularly in small, underrepresented classes. The framework improves macro-AUROC by 11%, demonstrating robustness against reduced features and perturbations.",
    "author": "Yu, Qinze; Dong, Zhihang; Fan, Xingyu; Zong, Licheng; Li, Yu",
    "doi": "10.1101/2021.11.10.468157",
    "id": "Yu-2021-HMDAMP",
    "journal": "bioRxiv",
    "keywords": "HMD-AMP, antimicrobial peptides, hierarchical multi-label, protein language models, deep learning",
    "title": "HMD-AMP: Protein language-powered hierarchical multi-label deep forest for annotating antimicrobial peptides",
    "type": "article",
    "url": "https://doi.org/10.1101/2021.11.10.468157",
    "year": 2021
  },
  {
    "abstract": "Proposes Bayesian latent class mixture models for analyzing antimicrobial resistance data, particularly addressing challenges such as censoring.",
    "author": "Zhang, Min",
    "doi": "10.31274/etd-20210609-210",
    "id": "Zhang-2021-BayesianLatentAMR",
    "journal": "Iowa State University Digital Repository",
    "keywords": "Bayesian models, latent class, antimicrobial resistance, data analysis, censoring",
    "title": "Bayesian Latent Class Mixture Models for Antimicrobial Resistance Data with Censoring",
    "type": "dissertation",
    "url": "http://dx.doi.org/10.31274/etd-20210609-210",
    "year": 2021
  },
  {
    "abstract": "The article details the development of AlphaFold, a neural network-based model capable of predicting protein structures with atomic accuracy, addressing a longstanding challenge in computational biology.",
    "author": "Jumper, John; Evans, Richard; Pritzel, Alexander; Green, Tim; Figurnov, Michael; Ronneberger, Olaf; Tunyasuvunakool, Kathryn; Bates, Russ; Žídek, Augustin; Potapenko, Anna; Bridgland, Alex; Meyer, Clemens; Kohl, Simon A. A.; Ballard, Andrew J.; Cowie, Andrew; Romera-Paredes, Bernardino; Nikolov, Stanislav; Jain, Rishub; Adler, Jonas; Back, Trevor; Petersen, Stig; Reiman, David; Clancy, Ellen; Zielinski, Michal; Steinegger, Martin; Pacholska, Michalina; Berghammer, Tamas; Bodenstein, Sebastian; Silver, David; Vinyals, Oriol; Senior, Andrew W.; Kavukcuoglu, Koray; Kohli, Pushmeet; Hassabis, Demis",
    "doi": "10.1038/s41586-021-03819-2",
    "id": "Jumper-2021-AlphaFold",
    "journal": "Nature",
    "keywords": "AlphaFold, protein structure prediction, neural networks, computational biology",
    "pages": "583-589",
    "title": "Highly accurate protein structure prediction with AlphaFold",
    "type": "article",
    "volume": "596",
    "year": 2021
  },
  {
    "abstract": "Protein structures and intermolecular interactions are critical for understanding biological function and mechanisms. A major challenge in computational structural biology is the accurate prediction of protein structures and interactions. This study demonstrates a three-track neural network architecture that integrates sequence information, protein structure, and inter-protein contacts to predict protein complexes with high accuracy. This approach significantly advances the field of protein modeling and provides a valuable tool for the biological research community.",
    "author": "Baek, Minkyung; DiMaio, Frank; Anishchenko, Ivan; Dauparas, Justas; Ovchinnikov, Sergey; and others",
    "doi": "10.1126/science.abj8754",
    "id": "Baek-2021-ThreeTrack",
    "journal": "Science",
    "number": "6557",
    "pages": "871–876",
    "title": "Accurate prediction of protein structures and interactions using a three-track neural network",
    "type": "article",
    "url": "https://www.science.org/doi/10.1126/science.abj8754",
    "volume": "373",
    "year": 2021
  },
  {
    "abstract": "This review summarizes applications of deep learning (DL) in NMR spectroscopy and presents a forward-looking perspective on DL's potential to transform NMR into a more efficient and powerful analytical tool in chemistry and life sciences.",
    "author": "Chen, Dicheng; Wang, Zi; Guo, Di; Orekhov, Vladislav; Qu, Xiaobo",
    "doi": "10.1002/chem.202000246",
    "id": "Chen-2020-DeepNMRReview",
    "journal": "Chemistry – A European Journal",
    "number": "46",
    "pages": "10391–10401",
    "title": "Review and prospect: Deep learning in nuclear magnetic resonance spectroscopy",
    "type": "article",
    "url": "https://doi.org/10.1002/chem.202000246",
    "volume": "26",
    "year": 2020
  },
  {
    "abstract": "Lots of learning tasks require dealing with graph data which contains rich relation information among elements. Modeling physics systems, learning molecular fingerprints, predicting protein interface, and classifying diseases demand a model to learn from graph inputs. In other domains such as learning from non-structural data like texts and images, reasoning on extracted structures (like the dependency trees of sentences and the scene graphs of images) is an important research topic which also needs graph reasoning models. Graph neural networks (GNNs) are neural models that capture the dependence of graphs via message passing between the nodes of graphs. In recent years, variants of GNNs such as graph convolutional network (GCN), graph attention network (GAT), graph recurrent network (GRN) have demonstrated ground-breaking performances on many deep learning tasks. In this survey, we propose a general design pipeline for GNN models and discuss the variants of each component, systematically categorize the applications, and propose four open problems for future research.",
    "author": "Zhou, Jie; Cui, Ganqu; Hu, Shengding; Zhang, Zhengyan; Yang, Cheng; Liu, Zhiyuan; Wang, Lifeng; Li, Changcheng; Sun, Maosong",
    "doi": "10.1016/j.aiopen.2021.01.001",
    "id": "Zhou-2020-GNNReview",
    "journal": "AI Open",
    "keywords": "graph neural networks; GNN; survey; message passing; GCN; GAT; GRN; graph reasoning",
    "pages": "57-81",
    "title": "Graph neural networks: A review of methods and applications",
    "type": "article",
    "url": "https://doi.org/10.1016/j.aiopen.2021.01.001",
    "volume": "1",
    "year": 2020
  },
  {
    "abstract": "The discovery of novel materials and functional molecules can help to solve some of society’s most urgent challenges, ranging from efficient energy harvesting and storage to uncovering novel pharmaceutical drug candidates. Traditionally matter engineering–generally denoted as inverse design–was based massively on human intuition and high-throughput virtual screening. The last few years have seen the emergence of significant interest in computer-inspired designs based on evolutionary or deep learning methods. The major challenge here is that the standard strings molecular representation SMILES shows substantial weaknesses in that task because large fractions of strings do not correspond to valid molecules. Here, we solve this problem at a fundamental level and introduce SELFIES (SELF-referencIng Embedded Strings), a string-based representation of molecules which is 100% robust. Every SELFIES string corresponds to a valid molecule, and SELFIES can represent every molecule. SELFIES can be directly applied in arbitrary machine learning models without the adaptation of the models; each of the generated molecule candidates is valid. In our experiments, the model’s internal memory stores two orders of magnitude more diverse molecules than a similar test with SMILES. Furthermore, as all molecules are valid, it allows for explanation and interpretation of the internal working of the generative models.",
    "author": "Krenn, Mario; Häse, Florian; Nigam, AkshatKumar; Friederich, Pascal; Aspuru-Guzik, Alan",
    "doi": "10.1088/2632-2153/aba947",
    "id": "Krenn-2020-SELFIES",
    "journal": "Machine Learning: Science and Technology",
    "note": "Open access",
    "number": "4",
    "pages": "045024",
    "publisher": "IOP Publishing",
    "title": "Self-referencing embedded strings (SELFIES): A robust molecular string representation",
    "type": "article",
    "url": "https://doi.org/10.1088/2632-2153/aba947",
    "volume": "1",
    "year": 2020
  },
  {
    "abstract": "DistilGPT2 is a smaller, faster, and lighter version of GPT-2 created by Hugging Face via knowledge distillation. It retains 97% of GPT-2’s performance while being 60% faster and smaller.",
    "author": "{Hugging Face}",
    "id": "HuggingFace-2020-DistilGPT2",
    "note": "Model card for DistilGPT2, a distilled version of OpenAI's GPT-2 model",
    "publisher": "Hugging Face",
    "title": "DistilGPT2 model card",
    "type": "software",
    "url": "https://huggingface.co/distilbert/distilgpt2",
    "year": 2020
  },
  {
    "abstract": "The paper applies Transformer-based models (BERT) to molecular representation learning, incorporating domain-specific auxiliary tasks to enhance representation quality. Their model, MolBERT, improves drug discovery benchmarks.",
    "author": "Fabian, Benedek; Edlich, Thomas; Gaspar, Héléna; Segler, Marwin; Meyers, Joshua; Fiscato, Marco; Ahmed, Mohamed",
    "id": "Fabian-2020-MolBERT",
    "journal": "Machine Learning for Molecules Workshop, NeurIPS 2020",
    "keywords": "Molecular Representation Learning, Language Models, Drug Discovery, Transformer, Auxiliary Tasks",
    "title": "Molecular Representation Learning with Language Models and Domain-Relevant Auxiliary Tasks",
    "type": "article",
    "url": "https://arxiv.org/abs/2011.13230",
    "year": 2020
  },
  {
    "abstract": "Here are presented technical notes and tips on developing graph generative models for molecular design. Although this work stems from the development of GraphINVENT, a Python platform for iterative molecular generation using graph neural networks, this work is relevant to researchers studying other architectures for graph-based molecular design. In this work, technical details that could be of interest to researchers developing their own molecular generative models are discussed, including an overview of previous work in graph-based molecular design and strategies for designing new models. Advice on development and debugging tools which are helpful during code development is also provided. Finally, methods that were tested but which ultimately did not lead to promising results in the development of GraphINVENT are described here in the hope that this will help other researchers avoid pitfalls in development and instead focus their efforts on more promising strategies for graph-based molecular generation.",
    "author": "Mercado, Rocío; Rastemo, Tobias; Lindelöf, Edvard; Klambauer, Günter; Engkvist, Ola; Chen, Hongming; Bjerrum, Esben Jannik",
    "doi": "10.1002/ail2.18",
    "id": "Mercado-2020-MolecularGraph",
    "journal": "Applied AI Letters",
    "number": "2",
    "title": "Practical notes on building molecular graph generative models",
    "type": "article",
    "url": "https://doi.org/10.1002/ail2.18",
    "volume": "1",
    "year": 2020
  },
  {
    "abstract": "State-of-the-art identification of the functional groups present in an unknown chemical entity requires the expertise of a skilled spectroscopist to analyse and interpret Fourier transform infra-red (FTIR), mass spectroscopy (MS) and/or nuclear magnetic resonance (NMR) data. This process can be time-consuming and error-prone, especially for complex chemical entities that are poorly characterised in the literature, or inefficient to use with synthetic robots producing molecules at an accelerated rate. Herein, we introduce a fast, multi-label deep neural network for accurately identifying all the functional groups of unknown compounds using a combination of FTIR and MS spectra. We do not use any database, pre-established rules, procedures, or peak-matching methods. Our trained neural network reveals patterns typically used by human chemists to identify standard groups. Finally, we experimentally validated our neural network, trained on single compounds, to predict functional groups in compound mixtures. Our methodology showcases practical utility for future use in autonomous analytical detection.",
    "author": "Fine, Jonathan A.; Rajasekar, Anand A.; Jethava, Krupal P.; Chopra, Gaurav",
    "doi": "10.1039/C9SC06240H",
    "id": "Fine-2020-SpectralDL",
    "journal": "Chemical Science",
    "pages": "4618-4630",
    "title": "Spectral deep learning for prediction and prospective validation of functional groups",
    "type": "article",
    "url": "https://doi.org/10.1039/C9SC06240H",
    "volume": "11",
    "year": 2020
  },
  {
    "abstract": "Translation of the findings in basic science and clinical research into routine practice is hampered by large variations in human phenotype. Developments in genotyping and phenotyping, such as proteomics and lipidomics, are beginning to address these limitations. In this work, we developed a new methodology for rapid, label-free molecular phenotyping of biological fluids (e.g., blood) by exploiting the recent advances in fast and highly efficient multidimensional inverse Laplace decomposition technique. We demonstrated that using two-dimensional T1-T2 correlational spectroscopy on a single drop of blood (<5 μL), a highly time- and patient-specific ‘molecular fingerprint’ can be obtained in minutes. Machine learning techniques were introduced to transform the NMR correlational map into user-friendly information for point-of-care disease diagnostic and monitoring. The clinical utilities of this technique were demonstrated through the direct analysis of human whole blood in various physiological (e.g., oxygenated/deoxygenated states) and pathological (e.g., blood oxidation, hemoglobinopathies) conditions.",
    "author": "Peng, Weng Kung; Ng, Tian-Tsong; Loh, Tze Ping",
    "doi": "10.1038/s42003-020-01262-z",
    "id": "Peng-2020-MLBlood",
    "journal": "Communications Biology",
    "number": "1",
    "title": "Machine learning assistive rapid, label-free molecular phenotyping of blood with two-dimensional NMR correlational spectroscopy",
    "type": "article",
    "url": "https://doi.org/10.1038/s42003-020-01262-z",
    "volume": "3",
    "year": 2020
  },
  {
    "abstract": "A robust system for automatic processing and assignment of raw 13C and 1H NMR data DP4-AI has been developed and integrated into our computational organic molecule structure elucidation workflow. Starting from a molecular structure with undefined stereochemistry or other structural uncertainty, this system allows for completely automated structure elucidation. Methods for NMR peak picking using objective model selection and algorithms for matching the calculated 13C and 1H NMR shifts to peaks in noisy experimental NMR data were developed. DP4-AI achieved a 60-fold increase in processing speed, and near-elimination of the need for scientist time, when rigorously evaluated using a challenging test set of molecules. DP4-AI represents a leap forward in NMR structure elucidation and a step-change in the functionality of DP4. It enables high-throughput analyses of databases and large sets of molecules, which were previously impossible, and paves the way for the discovery of new structural information through machine learning. This new functionality has been coupled with an intuitive GUI and is available as open-source software at https://github.com/KristapsE/DP4-AI.",
    "author": "Howarth, Alexander; Ermanis, Kristaps; Goodman, Jonathan M.",
    "doi": "10.1039/D0SC00442A",
    "id": "Howarth-2020-DP4AI",
    "journal": "Chemical Science",
    "number": "17",
    "pages": "4351-4359",
    "title": "DP4-AI automated NMR data analysis: straight from spectrometer to structure",
    "type": "article",
    "url": "https://doi.org/10.1039/D0SC00442A",
    "volume": "11",
    "year": 2020
  },
  {
    "abstract": "Accurate prediction of NMR chemical shifts at affordable computational cost is very important for different types of structural assignments in experimental studies. Density functional theory (DFT) and gauge-including atomic orbital (GIAO) are two of the most popular computational methods for NMR calculation, yet, they often fail to resolve ambiguities in structural assignments. Here, we present a new method that uses machine learning (ML) techniques (DFT+ML) that significantly increases the accuracy of 13C/1H NMR chemical shift prediction for a variety of organic molecules. The input of the generalizable DFT+ML model contains two critical parts: one is a vector providing insights into chemical environments, which can be evaluated without knowing the exact geometry of the molecule; the other one is the DFT calculated isotropic shielding constant. The DFT+ML model was trained with a dataset containing 476 13C and 270 1H experimental chemical shifts. For the DFT methods used here, the root mean square deviations (RMSDs) for the errors between predicted and experimental 13C/1H chemical shifts can be as small as 2.10/0.18 ppm, which is much lower than those from simple DFT (5.54/0.25 ppm), or DFT+linear regression (LR) (4.77/0.23 ppm) approaches. It also has a smaller maximum absolute error than two previously proposed NMR-predicting ML models. The robustness of the DFT+ML model is tested on two classes of organic molecules (TIC10 and hyacinthacines), where the correct isomers were unambiguously assigned to the experimental ones. Overall, the DFT+ML model is showing promise for structural assignments in a variety of systems, including stereoisomers, that are often challenging to determine experimentally.",
    "author": "Gao, Peng; Zhang, Jun; Peng, Qian; Zhang, Jie; Glezakou, Vassiliki-Alexandra",
    "doi": "10.1021/acs.jcim.0c00388",
    "id": "Gao-2020-MLNMR",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "8",
    "pages": "3746–3754",
    "title": "General Protocol for the Accurate Prediction of Molecular 13C/1H NMR Chemical Shifts via Machine Learning Augmented DFT",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.0c00388",
    "volume": "60",
    "year": 2020
  },
  {
    "author": "Zhang, Jinzhe; Terayama, Kei; Sumita, Masato; Yoshizoe, Kazuki; Ito, Kengo; Kikuchi, Jun; Tsuda, Koji",
    "doi": "10.1080/14686996.2020.1793382",
    "id": "Zhang-2020-NMR-TS",
    "journal": "Science and Technology of Advanced Materials",
    "number": "1",
    "pages": "552-561",
    "title": "NMR-TS: de novo molecule identification from NMR spectra",
    "type": "article",
    "url": "https://doi.org/10.1080/14686996.2020.1793382",
    "volume": "21",
    "year": 2020
  },
  {
    "abstract": "Machine learning (ML) methods have been present in the field of NMR since decades, but it has experienced a tremendous growth in the last few years, especially thanks to the emergence of deep learning (DL) techniques taking advantage of the increased amounts of data and available computer power. These algorithms are successfully employed for classification, regression, clustering, or dimensionality reduction tasks of large data sets and have been intensively applied in different areas of NMR including metabonomics, clinical diagnosis, or relaxometry. In this article, we concentrate on the various applications of ML/DL in the areas of NMR signal processing and analysis of small molecules, including automatic structure verification and prediction of NMR observables in solution.",
    "author": "Cobas, Carlos",
    "doi": "10.1002/mrc.4989",
    "id": "Cobas-2020-NMRSignalProcessing",
    "journal": "Magnetic Resonance in Chemistry",
    "number": "6",
    "pages": "512–519",
    "publisher": "Wiley",
    "title": "NMR signal processing, prediction, and structure verification with machine learning techniques",
    "type": "article",
    "url": "https://doi.org/10.1002/mrc.4989",
    "volume": "58",
    "year": 2020
  },
  {
    "author": "Huber, Florian; Verhoeven, Stefan; Meijer, Christiaan; Spreeuw, Hanno; Castilla, Efraín; Geng, Cunliang; van der Hooft, Justin; Rogers, Simon; Belloum, Adam; Diblen, Faruk; Spaaks, Jurriaan",
    "doi": "10.21105/joss.02411",
    "id": "Huber-2020-MatchMS",
    "journal": "Journal of Open Source Software",
    "number": "52",
    "pages": "2411",
    "title": "matchms - processing and similarity evaluation of mass spectrometry data",
    "type": "article",
    "url": "https://doi.org/10.21105/joss.02411",
    "volume": "5",
    "year": 2020
  },
  {
    "abstract": "GNNs and chemical fingerprints are the predominant approaches to representing molecules for property prediction. However, in NLP, transformers have become the de-facto standard for representation learning thanks to their strong downstream task transfer. In parallel, the software ecosystem around transformers is maturing rapidly, with libraries like HuggingFace and BertViz enabling streamlined training and introspection. In this work, we make one of the first attempts to systematically evaluate transformers on molecular property prediction tasks via our ChemBERTa model. ChemBERTa scales well with pretraining dataset size, offering competitive downstream performance on MoleculeNet and useful attention-based visualization modalities. Our results suggest that transformers offer a promising avenue of future work for molecular representation learning and property prediction. To facilitate these efforts, we release a curated dataset of 77M SMILES from PubChem suitable for large-scale self-supervised pretraining.",
    "author": "Chithrananda, Seyone; Grand, Gabriel; Ramsundar, Bharath",
    "booktitle": "Machine Learning for Molecules Workshop at NeurIPS 2020",
    "doi": "10.48550/arXiv.2010.09885",
    "id": "Chithrananda-2020-ChemBERTa",
    "keywords": "ChemBERTa, molecular machine learning, transformers, SMILES, self-supervised learning, pretraining",
    "title": "ChemBERTa: Large-Scale Self-Supervised Pretraining for Molecular Property Prediction",
    "type": "conference",
    "url": "https://ml4molecules.github.io/papers2020/ML4Molecules_2020_paper_67.pdf",
    "year": 2020
  },
  {
    "abstract": "Domain adaptation aims at benefiting from a labeled dataset drawn from a source distribution to learn a model from examples generated from a different but related target distribution. This paper introduces a framework using Optimal Transport (OT) to align source and target distributions by optimizing the ground metric, leading to reduced target risk. The proposed algorithm (MLOT) optimizes a Mahalanobis distance to enhance the transportation plan for domain adaptation, with experiments showing improved performance.",
    "author": "Kerdoncuff, Tanguy; Emonet, Rémi; Sebban, Marc",
    "doi": "10.24963/ijcai.2020/299",
    "id": "Kerdoncuff-2020-MLOT",
    "journal": "Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence (IJCAI-PRICAI 2020)",
    "keywords": "Optimal Transport; Domain Adaptation; Metric Learning; Mahalanobis Distance; MLOT",
    "number": "",
    "pages": "2162–2168",
    "title": "Metric Learning in Optimal Transport for Domain Adaptation",
    "type": "conference",
    "url": "https://doi.org/10.24963/ijcai.2020/299",
    "volume": "",
    "year": 2020
  },
  {
    "abstract": "Single-cell RNA sequencing (scRNA-seq) provides details for individual cells; however, crucial spatial information is often lost. We present SpaOTsc, a method relying on structured optimal transport to recover spatial properties of scRNA-seq data by utilizing spatial measurements of a relatively small number of genes. A spatial metric for individual cells in scRNA-seq data is first established based on a map connecting it with the spatial measurements. The cell–cell communications are then obtained by “optimally transporting” signal senders to target signal receivers in space. Using partial information decomposition, we next compute the intercellular gene–gene information flow to estimate the spatial regulations between genes across cells. Four datasets are employed for cross-validation of spatial gene expression prediction and comparison to known cell–cell communications. SpaOTsc has broader applications, both in integrating non-spatial single-cell measurements with spatial data, and directly in spatial single-cell transcriptomics data to reconstruct spatial cellular dynamics in tissues.",
    "author": "Cang, Zixuan; Nie, Qing",
    "doi": "10.1038/s41467-020-15968-5",
    "id": "Cang-2020-SpaOTsc",
    "journal": "Nature Communications",
    "keywords": "Single-cell RNA sequencing; Spatial transcriptomics; Cell–cell communication; Optimal transport; Gene–gene information flow; SpaOTsc",
    "number": "1",
    "pages": "2084",
    "title": "Inferring Spatial and Signaling Relationships Between Cells From Single Cell Transcriptomic Data",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-020-15968-5",
    "volume": "11",
    "year": 2020
  },
  {
    "abstract": "We present TrajectoryNet, a dynamic optimal transport framework for modeling cellular dynamics from snapshot data. TrajectoryNet learns an optimal transport map that describes cellular transitions over time, enabling the prediction of future cellular states and the inference of developmental trajectories. We demonstrate the effectiveness of TrajectoryNet on single-cell RNA sequencing data, capturing complex cellular behaviors and providing insights into underlying biological processes.",
    "author": "Tong, Alexander; Huang, Jessie; Wolf, Guy; Van Dijk, David; Krishnaswamy, Smita",
    "booktitle": "37th International Conference on Machine Learning (ICML)",
    "id": "Tong-2020-TrajectoryNet",
    "keywords": "Distribution-to-Distribution Regression; Optimal Transport; Scalable Machine Learning; Cellular Dynamics; Probabilistic Regression",
    "pages": "9526–9536",
    "publisher": "PMLR",
    "title": "TrajectoryNet: A Dynamic Optimal Transport Network for Modeling Cellular Dynamics",
    "type": "conference",
    "url": "https://proceedings.mlr.press/v119/tong20a.html",
    "year": 2020
  },
  {
    "abstract": "SE(3)-Transformers introduce a novel attention mechanism that is equivariant to SE(3) transformations, enabling accurate and efficient processing of 3D data such as molecular structures. This method demonstrates significant improvements in tasks requiring roto-translation invariance.",
    "author": "Fuchs, Fabian B.; Worrall, Daniel E.; Fischer, Volker; Welling, Max",
    "id": "Fuchs-2020-SE3Transformers",
    "journal": "Advances in Neural Information Processing Systems",
    "pages": "1970-1981",
    "title": "SE(3)-Transformers: 3D Roto-Translation Equivariant Attention Networks",
    "type": "article",
    "url": "https://arxiv.org/abs/2006.10503",
    "volume": "33",
    "year": 2020
  },
  {
    "abstract": "Accurate calculation of specific spectral properties for NMR is an important step for molecular structure elucidation. Here we report the development of a novel machine learning technique for accurately predicting chemical shifts of both 1H and 13C nuclei which exceeds DFT-accessible accuracy for 13C and 1H for a subset of nuclei, while being orders of magnitude more performant. Our method produces estimates of uncertainty, allowing for robust and confident predictions, and suggests future avenues for improved performance.",
    "author": "Jonas, Eric; Kuhn, Stefan",
    "doi": "10.1186/s13321-019-0374-3",
    "id": "Jonas-2019-NMRUncertainty",
    "journal": "Journal of Cheminformatics",
    "pages": "Article number: 50",
    "title": "Rapid prediction of NMR spectral properties with quantified uncertainty",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0374-3",
    "volume": "11",
    "year": 2019
  },
  {
    "abstract": "This work introduces a neural machine translation-inspired method that learns molecular descriptors by translating between different molecular representations. A deep neural network is trained to extract a low-dimensional vector that captures the shared semantics between SMILES and other encodings. These learned descriptors are shown to be competitive with graph-based models and outperform traditional molecular fingerprints in QSAR and virtual screening tasks, offering a consistent, continuous representation space that enables structure recovery and compound optimization.",
    "author": "Winter, Robin; Montanari, Floriane; Noé, Frank; Clevert, Djork-Arné",
    "doi": "10.1039/c8sc04175j",
    "id": "Winter-2019-MolecularDescriptors",
    "journal": "Chemical Science",
    "number": "6",
    "pages": "1692–1701",
    "title": "Learning continuous and data-driven molecular descriptors by translating equivalent chemical representations",
    "type": "article",
    "url": "https://doi.org/10.1039/c8sc04175j",
    "volume": "10",
    "year": 2019
  },
  {
    "abstract": "This paper introduces a novel variational autoencoder (VAE) that encodes multiple SMILES strings per molecule using stacked RNNs and atom-level pooling to produce a unified latent representation. It decodes to different SMILES variants and achieves near-bijective mappings, advancing molecular property regression and optimization.",
    "author": "Alperstein, Zaccary; Cherkasov, Artem; Rolfe, Jason T.",
    "id": "Alperstein-2019-AllSMILES",
    "journal": "arXiv",
    "note": "Preprint on arXiv",
    "title": "All SMILES Variational Autoencoder",
    "type": "article",
    "url": "https://arxiv.org/abs/1905.13343",
    "year": 2019
  },
  {
    "abstract": "The goal of regression and classification methods in supervised learning is to minimize the empirical risk, that is, the expectation of some loss function quantifying the prediction error under the empirical distribution. When facing scarce training data, overfitting is typically mitigated by adding regularization terms to the objective that penalize hypothesis complexity. In this paper we introduce new regularization techniques using ideas from distributionally robust optimization, and we give new probabilistic interpretations to existing techniques. Specifically, we propose to minimize the worst-case expected loss, where the worst case is taken over the ball of all (continuous or discrete) distributions that have a bounded transportation distance from the (discrete) empirical distribution. By choosing the radius of this ball judiciously, we can guarantee that the worst-case expected loss provides an upper confidence bound on the loss on test data, thus offering new generalization bounds. We prove that the resulting regularized learning problems are tractable and can be tractably kernelized for many popular loss functions. The proposed approach to regluarization is also extended to neural networks. We validate our theoretical out-of-sample guarantees through simulated and empirical experiments.",
    "author": "Shafieezadeh-Abadeh, Soroosh; Kuhn, Daniel; Mohajerin Esfahani, Peyman",
    "id": "ShafieezadehAbadeh-2019-RMT",
    "journal": "Journal of Machine Learning Research",
    "pages": "1-68",
    "title": "Regularization via mass transportation",
    "type": "article",
    "url": "https://www.jmlr.org/papers/v20/17-633.html",
    "volume": "20",
    "year": 2019
  },
  {
    "abstract": "We show that several machine learning estimators, including square-root least absolute shrinkage and selection and regularized logistic regression, can be represented as solutions to distributionally robust optimization problems. The associated uncertainty regions are based on suitably defined Wasserstein distances. Hence, our representations allow us to view regularization as a result of introducing an artificial adversary that perturbs the empirical distribution to account for out-of-sample effects in loss estimation. In addition, we introduce RWPI (robust Wasserstein profile inference), a novel inference methodology which extends the use of methods inspired by empirical likelihood to the setting of optimal transport costs (of which Wasserstein distances are a particular case). We use RWPI to show how to optimally select the size of uncertainty regions, and as a consequence we are able to choose regularization parameters for these machine learning estimators without the use of cross validation. Numerical experiments are also given to validate our theoretical findings.",
    "author": "Blanchet, Jose; Kang, Yang; Murthy, Karthyek",
    "doi": "10.1017/jpr.2019.49",
    "id": "Blanchet-2019-RWPI",
    "journal": "Journal of Applied Probability",
    "pages": "830-857",
    "title": "Robust Wasserstein profile inference and applications to machine learning",
    "type": "article",
    "url": "https://doi.org/10.1017/jpr.2019.49",
    "volume": "56",
    "year": 2019
  },
  {
    "abstract": "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).",
    "author": "Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina",
    "booktitle": "Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
    "doi": "10.18653/v1/N19-1423",
    "id": "Devlin-2019-BERT",
    "keywords": "BERT, language model, pretraining, transformers, NLP, NAACL",
    "note": "Best Long Paper",
    "pages": "4171–4186",
    "publisher": "Association for Computational Linguistics",
    "title": "BERT: pre-training of deep bidirectional transformers for language understanding",
    "type": "inproceedings",
    "url": "https://aclanthology.org/N19-1423/",
    "year": 2019
  },
  {
    "abstract": "Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.",
    "author": "Liu, Yinhan; Ott, Myle; Goyal, Naman; Du, Jingfei; Joshi, Mandar; Chen, Danqi; Levy, Omer; Lewis, Mike; Zettlemoyer, Luke; Stoyanov, Veselin",
    "doi": "10.48550/arXiv.1907.11692",
    "id": "Liu-2019-RoBERTa",
    "journal": "arXiv",
    "keywords": "RoBERTa, BERT, language model, pretraining, NLP, transformer",
    "number": "1907.11692",
    "title": "RoBERTa: a robustly optimized BERT pretraining approach",
    "type": "article",
    "url": "https://arxiv.org/abs/1907.11692",
    "year": 2019
  },
  {
    "abstract": "This monograph reviews optimal transport (OT) with emphasis on computational aspects and applications in data science. It presents OT as a problem of comparing probability distributions with a geometric structure induced by transport cost. After tracing historical developments from Monge and Kantorovich to modern solvers, it surveys efficient numerical methods and algorithmic innovations. Applications across imaging, computer graphics, and machine learning are discussed, alongside extensions to kernel methods, statistical inference, and information theory. All results are supported with reproducible code and resources from a companion website.",
    "author": "Peyré, Gabriel; Cuturi, Marco",
    "doi": "10.1561/2200000073",
    "id": "Peyre-2019-CompOT",
    "journal": "Foundations and Trends in Machine Learning",
    "number": "5-6",
    "pages": "355–607",
    "title": "Computational optimal transport: With applications to data science",
    "type": "book",
    "url": "https://doi.org/10.1561/2200000073",
    "volume": "11",
    "year": 2019
  },
  {
    "abstract": "We note that common implementations of adaptive gradient algorithms, such as Adam, limit the potential benefit of weight decay regularization, because the weights do not decay multiplicatively (as would be expected for standard weight decay) but by an additive constant factor. We propose a simple way to resolve this issue by decoupling weight decay and the optimization steps taken w.r.t. the loss function. We provide empirical evidence that our proposed modification (i) decouples the optimal choice of weight decay factor from the setting of the learning rate for both standard SGD and Adam, and (ii) substantially improves Adam's generalization performance, allowing it to compete with SGD with momentum on image classification datasets (on which it was previously typically outperformed by the latter). We also demonstrate that longer optimization runs require smaller weight decay values for optimal results and introduce a normalized variant of weight decay to reduce this dependence. Finally, we propose a version of Adam with warm restarts (AdamWR) that has strong anytime performance while achieving state-of-the-art results on CIFAR-10 and ImageNet32x32. Our source code will become available after the review process.",
    "author": "Loshchilov, Ilya; Hutter, Frank",
    "doi": "10.48550/arXiv.1711.05101",
    "id": "Loshchilov-2019-DecoupledWD",
    "keywords": "weight decay, Adam optimizer, SGD, regularization, deep learning",
    "note": "arXiv preprint; published as a conference paper at ICLR 2019",
    "title": "Fixing weight decay regularization in Adam",
    "type": "article",
    "url": "https://arxiv.org/abs/1711.05101",
    "year": 2019
  },
  {
    "abstract": "SMILES-BERT is a semi-supervised model for molecular property prediction that uses a Transformer-based architecture pre-trained on large-scale unlabeled data through a Masked SMILES Recovery task. The pre-trained model is then fine-tuned for downstream tasks. Experimental results show that SMILES-BERT outperforms state-of-the-art methods on three benchmark datasets, demonstrating the effectiveness of unsupervised pre-training and strong generalization performance.",
    "author": "Wang, Sheng; Guo, Yuzhi; Wang, Yuhong; Sun, Hongmao; Huang, Junzhou",
    "doi": "10.1145/3307339.3342186",
    "id": "Wang-2019-SMILESBERT",
    "journal": "Proceedings of the 10th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics (BCB '19)",
    "pages": "429–436",
    "title": "SMILES-BERT: Large scale unsupervised pre-training for molecular property prediction",
    "type": "article",
    "url": "https://doi.org/10.1145/3307339.3342186",
    "year": 2019
  },
  {
    "abstract": "Recurrent neural networks (RNNs) trained with SMILES strings can generate large chemical spaces. We benchmarked models trained on GDB-13 subsets using canonical, randomized, and DeepSMILES variants, with LSTM and GRU cells across various hyperparameters. Models trained with 1M randomized SMILES and LSTM generalized best, covering almost all of GDB-13 with uniform probability. Randomized SMILES also improved models trained on smaller datasets and ChEMBL, doubling the number of unique generated molecules compared to canonical SMILES.",
    "author": "Arús-Pous, Josep; Johansson, Simon Viet; Prykhodko, Oleksii; Bjerrum, Esben Jannik; Tyrchan, Christian; Reymond, Jean-Louis; Chen, Hongming; Engkvist, Ola",
    "doi": "10.1186/s13321-019-0393-0",
    "id": "ArusPous-2019-RandomizedSMILES",
    "journal": "Journal of Cheminformatics",
    "keywords": "SMILES augmentation, Randomized SMILES, RNN, LSTM, GRU, GDB-13, ChEMBL, Molecular Generative Models",
    "pages": "Article number: 71",
    "publisher": "Springer",
    "title": "Randomized SMILES strings improve the quality of molecular generative models",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0393-0",
    "volume": "11",
    "year": 2019
  },
  {
    "abstract": "As transfer learning from large-scale pre-trained models becomes more prevalent in natural language processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device study.",
    "author": "Sanh, Victor; Debut, Lysandre; Chaumond, Julien; Wolf, Thomas",
    "id": "Sanh-2019-DistilBERT",
    "journal": "Proceedings of EMC$^2$: Efficient Machine Learning, Co-located with NeurIPS 2019",
    "keywords": "DistilBERT, knowledge distillation, model compression, language modeling, BERT, NLP",
    "note": "Presented at EMC²: Efficient Machine Learning, NeurIPS 2019",
    "title": "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter",
    "type": "article",
    "url": "https://arxiv.org/abs/1910.01108",
    "year": 2019
  },
  {
    "abstract": "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on task-specific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. When conditioned on a document plus questions, the answers generated by the language model reach 55 F1 on the CoQA dataset—matching or exceeding the performance of 3 out of 4 baseline systems without using the 127,000+ training examples. The capacity of the language model is essential to the success of zero-shot task transfer and increasing it improves performance in a log-linear fashion across tasks. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state-of-the-art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.",
    "author": "Radford, Alec; Wu, Jeffrey; Child, Rewon; Luan, David; Amodei, Dario; Sutskever, Ilya",
    "id": "Radford-2019-GPT2",
    "note": "Technical report, no DOI.",
    "publisher": "OpenAI",
    "title": "Language models are unsupervised multitask learners",
    "type": "techreport",
    "url": "https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf",
    "year": 2019
  },
  {
    "abstract": "Comparing probability distributions is a fundamental problem in data sciences. Simple norms and divergences such as the total variation and the relative entropy only compare densities in a point-wise manner and fail to capture the geometric nature of the problem. In sharp contrast, Maximum Mean Discrepancies (MMD) and Optimal Transport distances (OT) are two classes of distances between measures that take into account the geometry of the underlying space and metrize the convergence in law. This paper studies the Sinkhorn divergences, a family of geometric divergences that interpolates between MMD and OT. Relying on a new notion of geometric entropy, we provide theoretical guarantees for these divergences: positivity, convexity and metrization of the convergence in law. On the practical side, we detail a numerical scheme that enables the large scale application of these divergences for machine learning: on the GPU, gradients of the Sinkhorn loss can be computed for batches of a million samples.",
    "author": "Feydy, Jean; Séjourné, Thibault; Vialard, François-Xavier; Amari, Shun-ichi; Trouvé, Alain; Peyré, Gabriel",
    "booktitle": "The 22nd International Conference on Artificial Intelligence and Statistics",
    "id": "Feydy-2019-Sinkhorn",
    "note": "Preprint available at arXiv:1810.08278. Peer-reviewed version presented at AISTATS 2019. Code: https://github.com/jeanfeydy/geomloss | Docs: https://www.kernel-operations.io/geomloss",
    "pages": "2681–2690",
    "title": "Interpolating between Optimal Transport and MMD using Sinkhorn Divergences",
    "type": "inproceedings",
    "url": "https://arxiv.org/abs/1810.08278",
    "year": 2019
  },
  {
    "abstract": "When confronted with a substance of unknown identity, researchers often perform mass spectrometry on the sample and compare the observed spectrum to a library of previously collected spectra to identify the molecule. While popular, this approach will fail to identify molecules that are not in the existing library. In response, we propose to improve the library’s coverage by augmenting it with synthetic spectra that are predicted from candidate molecules using machine learning. We contribute a lightweight neural network model that quickly predicts mass spectra for small molecules, averaging 5 ms per molecule with a recall-at-10 accuracy of 91.8%. Achieving high-accuracy predictions requires a novel neural network architecture that is designed to capture typical fragmentation patterns from electron ionization. We analyze the effects of our modeling innovations on library matching performance and compare our models to prior machine-learning-based work on spectrum prediction.",
    "author": "Wei, Jennifer N.; Belanger, David; Adams, Ryan P.; Sculley, D.",
    "doi": "10.1021/acscentsci.9b00085",
    "id": "Wei-2019-NEIMS",
    "journal": "ACS Central Science",
    "number": "4",
    "pages": "700–708",
    "title": "Rapid prediction of electron–ionization mass spectrometry using neural networks",
    "type": "article",
    "url": "https://pubs.acs.org/doi/10.1021/acscentsci.9b00085",
    "volume": "5",
    "year": 2019
  },
  {
    "abstract": "Addresses the challenge of inferring molecular structures from spectroscopic data using deep imitation learning.",
    "author": "Jonas, Eric",
    "booktitle": "Advances in Neural Information Processing Systems",
    "id": "Jonas-2019-DeepImitation",
    "keywords": "Imitation Learning, Molecular Inverse Problems, NMR Spectroscopy, Neural Networks",
    "note": "Presented at NeurIPS 2019",
    "title": "Deep imitation learning for molecular inverse problems",
    "type": "inproceedings",
    "url": "https://proceedings.neurips.cc/paper/2019/hash/b0bef4c9a6e50d43880191492d4fc827-Abstract.html",
    "volume": "32",
    "year": 2019
  },
  {
    "author": "Zhang, Richard",
    "booktitle": "Proceedings of the 36th International Conference on Machine Learning",
    "id": "Zhang-2019-ShiftInvariant",
    "journal": "Proceedings of the 36th International Conference on Machine Learning",
    "pages": "7324–7334",
    "publisher": "PMLR",
    "title": "Making convolutional networks shift-invariant again",
    "type": "proceedings-article",
    "url": "https://proceedings.mlr.press/v97/zhang19a.html",
    "volume": "97",
    "year": 2019
  },
  {
    "abstract": "The IMPRESSION (Intelligent Machine PREdiction of Shift and Scalar information Of Nuclei) machine learning system provides an efficient and accurate method for the prediction of NMR parameters from 3-dimensional molecular structures. Here we demonstrate that machine learning predictions of NMR parameters, trained on quantum chemical computed values, can be as accurate as, but computationally much more efficient (tens of milliseconds per molecular structure) than, quantum chemical calculations (hours/days per molecular structure) starting from the same 3-dimensional structure. Training the machine learning system on quantum chemical predictions, rather than experimental data, circumvents the need for the existence of large, structurally diverse, error-free experimental databases and makes IMPRESSION applicable to solving 3-dimensional problems such as molecular conformation and stereoisomerism.",
    "author": "Gerrard, Will; Bratholm, Lars A.; Packer, Martin J.; Mulholland, Adrian J.; Glowacki, David R.; Butts, Craig P.",
    "doi": "10.1039/c9sc03854j",
    "id": "Gerrard-2019-IMPRESSION",
    "journal": "Chemical Science",
    "pages": "508-515",
    "title": "IMPRESSION – prediction of NMR parameters for 3-dimensional chemical structures using machine learning with near quantum chemical accuracy",
    "type": "article",
    "url": "https://doi.org/10.1039/c9sc03854j",
    "volume": "11",
    "year": 2019
  },
  {
    "abstract": "Organic synthesis is one of the key stumbling blocks in medicinal chemistry. A necessary yet unsolved step in planning synthesis is solving the forward problem: given reactants and reagents, predict the products. Similar to other work, we treat reaction prediction as a machine translation problem between SMILES strings of reactants-reagents and the products. We show that a multi-head attention Molecular Transformer model outperforms all algorithms in the literature, achieving a top-1 accuracy above 90% on a common benchmark dataset. Our algorithm requires no handcrafted rules, and accurately predicts subtle chemical transformations. Crucially, our model can accurately estimate its own uncertainty, with an uncertainty score that is 89% accurate in terms of classifying whether a prediction is correct. Furthermore, we show that the model is able to handle inputs without reactant-reagent split and including stereochemistry, which makes our method universally applicable.",
    "author": "Schwaller, Philippe; Laino, Teodoro; Gaudin, Théophile; Bolgar, Peter; Hunter, Christopher A.; Bekas, Costas; Lee, Alpha A.",
    "doi": "10.1021/acscentsci.9b00576",
    "id": "Schwaller-2019-MolecularTransformer",
    "journal": "ACS Central Science",
    "keywords": "Molecular Transformer, reaction prediction, uncertainty calibration, deep learning, SMILES, chemical synthesis",
    "number": "9",
    "pages": "1572-1583",
    "title": "Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction",
    "type": "article",
    "url": "https://doi.org/10.1021/acscentsci.9b00576",
    "volume": "5",
    "year": 2019
  },
  {
    "author": "Taherian Fard, Atefeh; Ragan, Mark A.",
    "doi": "10.1007/978-1-4939-9224-9_7",
    "id": "Taherian-2019-Waddington",
    "journal": "Methods in Molecular Biology",
    "keywords": "Waddington epigenetic landscape, quantitative modeling, stem cell biology, differentiation",
    "pages": "157-171",
    "publisher": "Springer New York",
    "title": "Quantitative Modelling of the Waddington Epigenetic Landscape",
    "type": "book-chapter",
    "url": "https://doi.org/10.1007/978-1-4939-9224-9_7",
    "volume": "1",
    "year": 2019
  },
  {
    "abstract": "Current status and applications of genome-scale metabolic models.",
    "author": "Gu, Changdai; Kim, Gi Bae; Kim, Won Jun; Kim, Hyun Uk; Lee, Sang Yup",
    "doi": "10.1186/s13059-019-1730-3",
    "id": "Gu-2019-GEMApplications",
    "journal": "Genome Biology",
    "keywords": "Genome-scale metabolic models, applications, metabolism, systems biology",
    "number": "1",
    "pages": "121",
    "publisher": "Springer Science and Business Media LLC",
    "title": "Current status and applications of genome-scale metabolic models",
    "type": "article",
    "url": "https://doi.org/10.1186/s13059-019-1730-3",
    "volume": "20",
    "year": 2019
  },
  {
    "abstract": "Gene2Vec explores distributed representations for genes based on co-expression data. Leveraging large-scale datasets from the GEO database, this study introduces a vectorized embedding model that captures gene functional similarities, enabling insights into biological pathways and gene-gene interactions. The approach demonstrates improved clustering and predictions, contributing to bioinformatics applications such as gene interaction analysis and functional annotation.",
    "author": "Du, Jingcheng; Jia, Peilin; Dai, Yulin; Tao, Cui; Zhao, Zhongming; Zhi, Degui",
    "doi": "10.1186/s12864-018-5370-x",
    "id": "Du-2019-Gene2Vec",
    "journal": "BMC Genomics",
    "keywords": "Gene2Vec; Gene embedding; Single-cell RNA-seq; Co-expression; Functional genomics; Bioinformatics",
    "number": "",
    "pages": "7",
    "title": "Gene2Vec: Distributed Representation of Genes Based on Co-Expression",
    "type": "article",
    "url": "https://doi.org/10.1186/s12864-018-5370-x",
    "volume": "19",
    "year": 2019
  },
  {
    "abstract": "In this paper, we propose a latent feature group learning (LFGL) algorithm to discover feature grouping structures and subspace clusters for high-dimensional data. The feature grouping structures are learned analytically to enhance clustering accuracy and efficiency. LFGL employs Darwinian evolutionary processes to explore optimal feature groupings and mass-based dissimilarity measures rather than Euclidean distances, optimizing weights using nonnegative matrix factorization under orthogonality constraints. Experimental results demonstrate the LFGL algorithm's superiority in clustering high-dimensional datasets.",
    "author": "Lin, Ya-Wei Eileen; Coifman, Ronald; Mishne, Gal; Talmon, Ronen",
    "doi": "10.3390/info10060208",
    "id": "Lin-2019-TreeWassersteinHighDim",
    "journal": "Information",
    "keywords": "Tree-Wasserstein; Latent Feature Hierarchy; High-Dimensional Data; Optimal Transport; Nonnegative Matrix Factorization",
    "number": "6",
    "pages": "208",
    "title": "Tree-Wasserstein Distance for High-Dimensional Data with a Latent Feature Hierarchy",
    "type": "article",
    "url": "https://doi.org/10.3390/info10060208",
    "volume": "10",
    "year": 2019
  },
  {
    "abstract": "Understanding the molecular programs that guide differentiation during development is a major challenge. Here, we introduce Waddington-OT, an approach for studying developmental time courses to infer ancestor-descendant fates and model the regulatory programs that underlie them. We apply the method to reconstruct the landscape of reprogramming from 315,000 single-cell RNA sequencing (scRNA-seq) profiles, collected at half-day intervals across 18 days. The results reveal a wider range of developmental programs than previously characterized. Cells gradually adopt either a terminal stromal state or a mesenchymal-to-epithelial transition state. The latter gives rise to populations related to pluripotent, extra-embryonic, and neural cells, with each harboring multiple finer subpopulations. The analysis predicts transcription factors and paracrine signals that affect fates, and experiments validate that the TF Obox6 and the cytokine GDF9 enhance reprogramming efficiency. Our approach sheds light on the process and outcome of reprogramming and provides a framework applicable to diverse temporal processes in biology.",
    "author": "Schiebinger, Geoffrey; Shu, Jian; Tabaka, Marcin; Cleary, Brian; Subramanian, Vidya; Solomon, Aryeh; Gould, Joshua; Liu, Siyan; Lin, Stacie; Berube, Peter; Lee, Lia; Chen, Jenny; Brumbaugh, Justin; Rigollet, Philippe; Hochedlinger, Konrad; Jaenisch, Rudolf; Regev, Aviv; Lander, Eric S.",
    "doi": "10.1016/j.cell.2019.02.026",
    "id": "Schiebinger-2019-OptimalTransport",
    "journal": "Cell",
    "keywords": "Single-cell RNA sequencing; Developmental trajectories; Cellular reprogramming; Optimal transport; Waddington-OT; Transcription factors; Mesenchymal-to-epithelial transition; Pluripotency; Gene expression analysis; Cellular differentiation",
    "number": "6",
    "pages": "928–943",
    "title": "Optimal transport analysis of single-cell gene expression identifies developmental trajectories in reprogramming",
    "type": "article",
    "url": "https://doi.org/10.1016/j.cell.2019.02.026",
    "volume": "176",
    "year": 2019
  },
  {
    "abstract": "This chapter explores asynchronous stochastic variational inference as applied to distributed systems, detailing methods for improving convergence and computational efficiency in machine learning tasks.",
    "author": "Mohamad, Saad; Bouchachia, Abdelhamid; Sayed-Mouchaweh, Moamar",
    "doi": "10.1007/978-3-030-16841-4_31",
    "id": "Mohamad-2019-AsynchronousSVI",
    "journal": "Proceedings of the International Neural Networks Society, Recent Advances in Big Data and Deep Learning",
    "keywords": "Asynchronous SVI; distributed optimization; machine learning",
    "number": "",
    "pages": "296-308",
    "publisher": "Springer International Publishing",
    "title": "Asynchronous Stochastic Variational Inference",
    "type": "book-chapter",
    "url": "https://doi.org/10.1007/978-3-030-16841-4_31",
    "volume": "",
    "year": 2019
  },
  {
    "abstract": "This paper introduces UniRep, a deep learning model that generates protein sequence embeddings capturing structural and functional information, facilitating various protein engineering tasks.",
    "author": "Alley, Ethan C.; Khimulya, Grigory; Biswas, Surojit; AlQuraishi, Mohammed; Church, George M.",
    "doi": "10.1038/s41592-019-0598-1",
    "id": "Alley-2019-UniRep",
    "journal": "Nature Methods",
    "keywords": "UniRep, Protein engineering, Sequence embeddings, Deep learning, Structural biology",
    "number": "12",
    "pages": "1315–1322",
    "title": "Unified rational protein engineering with sequence-based deep representation learning",
    "type": "article",
    "volume": "16",
    "year": 2019
  },
  {
    "abstract": "This paper introduces TAPE (Tasks Assessing Protein Embeddings), a benchmark suite for evaluating protein transfer learning. It includes five biologically relevant tasks and provides standardized datasets and evaluation techniques to facilitate progress in protein modeling.",
    "author": "Rao, Roshan; Bhattacharya, Nicholas; Thomas, Neil; Duan, Yan; Chen, Xi; Canny, John; Abbeel, Pieter; Song, Yun S.",
    "doi": "10.48550/arXiv.1906.08230",
    "id": "Rao-2019-TAPE",
    "journal": "Advances in Neural Information Processing Systems (NeurIPS)",
    "keywords": "TAPE, Protein transfer learning, Benchmark suite, Protein embeddings, Evaluation",
    "pages": "9689-9701",
    "title": "Evaluating Protein Transfer Learning with TAPE",
    "type": "article",
    "year": 2019
  },
  {
    "abstract": "AlphaZero is a single general-purpose reinforcement learning system that mastered chess, shogi, and Go through self-play without any domain-specific adaptations. It exceeded the performance of specialized programs in each game and represents a major advance toward general game-playing AI.",
    "author": "Silver, David; Hubert, Thomas; Schrittwieser, Julian; Antonoglou, Ioannis; Lai, Matthew; Guez, Arthur; Lanctot, Marc; Sifre, Laurent; Kumaran, Dharshan; Graepel, Thore; Lillicrap, Timothy; Simonyan, Karen; Hassabis, Demis",
    "doi": "10.1126/science.aar6404",
    "id": "Silver-2018-AlphaZero",
    "journal": "Science",
    "number": "6419",
    "pages": "1140–1144",
    "title": "A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play",
    "type": "article",
    "url": "https://doi.org/10.1126/science.aar6404",
    "volume": "362",
    "year": 2018
  },
  {
    "abstract": "A large scale benchmark for molecular machine learning consisting of multiple public datasets, metrics, featurizations and learning algorithms.",
    "author": "Wu, Zhenqin; Ramsundar, Bharath; Feinberg, Evan N.; Gomes, Joseph; Geniesse, Caleb; Pappu, Aneesh S.; Leswing, Karl; Pande, Vijay",
    "doi": "10.1039/c7sc02664a",
    "id": "Wu-2018-MoleculeNet",
    "journal": "Chemical Science",
    "note": "Open access PDF available: https://pubs.rsc.org/en/content/articlepdf/2018/sc/c7sc02664a",
    "number": "2",
    "pages": "513 - 530",
    "title": "MoleculeNet: a benchmark for molecular machine learning",
    "type": "article",
    "url": "https://doi.org/10.1039/c7sc02664a",
    "volume": "9",
    "year": 2018
  },
  {
    "abstract": "We describe a SMILES-like syntax called DeepSMILES that addresses two of the main reasons for invalid syntax when using a probabilistic model to generate SMILES strings. The DeepSMILES syntax avoids the problem of unbalanced parentheses by only using close parentheses, where the number of parentheses indicates the branch length. In addition, DeepSMILES avoids the problem of pairing ring closure symbols by using only a single symbol at the ring closing location, where the symbol indicates the ring size. We show that this syntax can be interconverted to/from SMILES with string processing without any loss of information, including stereo configuration.",
    "author": "O'Boyle, Noel; Dalke, Andrew",
    "doi": "10.26434/chemrxiv.7097960.v1",
    "id": "OBoyle-2018-DeepSMILES",
    "journal": "ChemRxiv",
    "keywords": "DeepSMILES, SMILES, cheminformatics, machine learning, molecular representation",
    "title": "DeepSMILES: An adaptation of SMILES for use in machine-learning of chemical structures",
    "type": "article",
    "url": "https://doi.org/10.26434/chemrxiv.7097960.v1",
    "year": 2018
  },
  {
    "abstract": "Chemical autoencoders are attractive models as they combine chemical space navigation with possibilities for de novo molecule generation in areas of interest. This enables them to produce focused chemical libraries around a single lead compound for employment early in a drug discovery project. Here, it is shown that the choice of chemical representation, such as strings from the simplified molecular-input line-entry system (SMILES), has a large influence on the properties of the latent space. It is further explored to what extent translating between different chemical representations influences the latent space similarity to the SMILES strings or circular fingerprints. By employing SMILES enumeration for either the encoder or decoder, it is found that the decoder has the largest influence on the properties of the latent space. Training a sequence to sequence heteroencoder based on recurrent neural networks (RNNs) with long short-term memory cells (LSTM) to predict different enumerated SMILES strings from the same canonical SMILES string gives the largest similarity between latent space distance and molecular similarity measured as circular fingerprints similarity. Using the output from the code layer in quantitative structure activity relationship (QSAR) of five molecular datasets shows that heteroencoder derived vectors markedly outperforms autoencoder derived vectors as well as models built using ECFP4 fingerprints, underlining the increased chemical relevance of the latent space. However, the use of enumeration during training of the decoder leads to a marked increase in the rate of decoding to different molecules than encoded, a tendency that can be counteracted with more complex network architectures.",
    "author": "Bjerrum, Esben Jannik; Sattarov, Boris",
    "doi": "10.3390/biom8040131",
    "id": "Bjerrum-2018-Heteroencoders",
    "journal": "Biomolecules",
    "number": "4",
    "pages": "131",
    "publisher": "MDPI",
    "title": "Improving chemical autoencoder latent space and molecular de novo generation diversity with heteroencoders",
    "type": "article",
    "url": "https://doi.org/10.3390/biom8040131",
    "volume": "8",
    "year": 2018
  },
  {
    "abstract": "Distributionally Robust Supervised Learning (DRSL) is necessary for building reliable machine learning systems. When machine learning is deployed in the real world, its performance can be significantly degraded because test data may follow a different distribution from training data. DRSL with f-divergences explicitly considers the worst-case distribution shift by minimizing the adversarially reweighted training loss. In this paper, we analyze this DRSL, focusing on the classification scenario. Since the DRSL is explicitly formulated for a distribution shift scenario, we naturally expect it to give a robust classifier that can aggressively handle shifted distributions. However, surprisingly, we prove that the DRSL just ends up giving a classifier that exactly fits the given training distribution, which is too pessimistic. This pessimism comes from two sources: the particular losses used in classification and the fact that the variety of distributions to which the DRSL tries to be robust is too wide. Motivated by our analysis, we propose simple DRSL that overcomes this pessimism and empirically demonstrate its effectiveness.",
    "author": "Hu, Weihua; Niu, Gang; Sato, Issei; Sugiyama, Masashi",
    "booktitle": "Proceedings of the 35th International Conference on Machine Learning",
    "id": "Hu-2018-DRSLClassifiers",
    "pages": "2029-2037",
    "publisher": "PMLR",
    "title": "Does distributionally robust supervised learning give robust classifiers?",
    "type": "inproceedings",
    "url": "https://proceedings.mlr.press/v80/hu18a.html",
    "volume": "80",
    "year": 2018
  },
  {
    "abstract": "Neural networks are vulnerable to adversarial examples and researchers have proposed many heuristic attack and defense mechanisms. We address this problem through the principled lens of distributionally robust optimization, which guarantees performance under adversarial input perturbations. By considering a Lagrangian penalty formulation of perturbing the underlying data distribution in a Wasserstein ball, we provide a training procedure that augments model parameter updates with worst-case perturbations of training data. For smooth losses, our procedure provably achieves moderate levels of robustness with little computational or statistical cost relative to empirical risk minimization. Furthermore, our statistical guarantees allow us to efficiently certify robustness for the population loss. For imperceptible perturbations, our method matches or outperforms heuristic approaches.",
    "author": "Sinha, Aman; Namkoong, Hongseok; Duchi, John C.",
    "doi": "10.48550/arXiv.1710.10571",
    "id": "Namkoong-2018-VarianceDRO",
    "journal": "International Conference on Learning Representations (ICLR)",
    "title": "Certifying some distributional robustness with principled adversarial training",
    "type": "inproceedings",
    "url": "https://doi.org/10.48550/arXiv.1710.10571",
    "year": 2018
  },
  {
    "abstract": "Numerous deep learning applications benefit from multi-task learning with multiple regression and classification objectives. In this paper we make the observation that the performance of such systems is strongly dependent on the relative weighting between each task’s loss. Tuning these weights by hand is a difficult and expensive process, making multi-task learning prohibitive in practice. We propose a principled approach to multi-task deep learning which weighs multiple loss functions by considering the homoscedastic uncertainty of each task. This allows us to simultaneously learn various quantities with different units or scales in both classification and regression settings. We demonstrate our model learning per-pixel depth regression, semantic and instance segmentation from a monocular input image. Perhaps surprisingly, we show our model can learn multi-task weightings and outperform separate models trained individually on each task.",
    "author": "Cipolla, Roberto; Gal, Yarin; Kendall, Alex",
    "doi": "10.1109/CVPR.2018.00781",
    "id": "Kendall-2018-MultiTaskUncertainty",
    "journal": "2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition",
    "note": "Conference paper",
    "title": "Multi-task learning using uncertainty to weigh losses for scene geometry and semantics",
    "type": "article",
    "url": "https://doi.org/10.1109/CVPR.2018.00781",
    "year": 2018
  },
  {
    "abstract": "Inspired by natural language processing techniques, we here introduce Mol2vec, which is an unsupervised machine learning approach to learn vector representations of molecular substructures. Like the Word2vec models, where vectors of closely related words are in close proximity in the vector space, Mol2vec learns vector representations of molecular substructures that point in similar directions for chemically related substructures. Compounds can finally be encoded as vectors by summing the vectors of the individual substructures and, for instance, be fed into supervised machine learning approaches to predict compound properties. The underlying substructure vector embeddings are obtained by training an unsupervised machine learning approach on a so-called corpus of compounds that consists of all available chemical matter. The resulting Mol2vec model is pretrained once, yields dense vector representations, and overcomes drawbacks of common compound feature representations such as sparseness and bit collisions. The prediction capabilities are demonstrated on several compound property and bioactivity data sets and compared with results obtained for Morgan fingerprints as a reference compound representation. Mol2vec can be easily combined with ProtVec, which employs the same Word2vec concept on protein sequences, resulting in a proteochemometric approach that is alignment-independent and thus can also be easily used for proteins with low sequence similarities.",
    "author": "Jaeger, Sabrina; Fulle, Simone; Turk, Samo",
    "doi": "10.1021/acs.jcim.7b00616",
    "id": "Jaeger-2018-Mol2vec",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "1",
    "pages": "27–35",
    "publisher": "American Chemical Society",
    "title": "Mol2vec: Unsupervised machine learning approach with chemical intuition",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.7b00616",
    "volume": "58",
    "year": 2018
  },
  {
    "abstract": "Motivation: Metabolites, small molecules that are involved in cellular reactions, provide a direct functional signature of cellular state. Untargeted metabolomics experiments usually rely on tandem mass spectrometry to identify the thousands of compounds in a biological sample. Recently, we presented CSI:FingerID for searching in molecular structure databases using tandem mass spectrometry data. CSI:FingerID predicts a molecular fingerprint that encodes the structure of the query compound, then uses this to search a molecular structure database such as PubChem. Scoring of the predicted query fingerprint and deterministic target fingerprints is carried out assuming independence between the molecular properties constituting the fingerprint. Results: We present a scoring that takes into account dependencies between molecular properties. As before, we predict posterior probabilities of molecular properties using machine learning. Dependencies between molecular properties are modeled as a Bayesian tree network; the tree structure is estimated on the fly from the instance data. For each edge, we also estimate the expected covariance between the two random variables. For fixed marginal probabilities, we then estimate conditional probabilities using the known covariance. Now, the corrected posterior probability of each candidate can be computed, and candidates are ranked by this score. Modeling dependencies improves identification rates of CSI:FingerID by 2.85 percentage points. Availability and implementation: The new scoring Bayesian (fixed tree) is integrated into SIRIUS 4.0 (https://bio.informatik.uni-jena.de/software/sirius/).",
    "author": "Ludwig, Marcus; Dührkop, Kai; Böcker, Sebastian",
    "doi": "10.1093/bioinformatics/bty245",
    "id": "Ludwig-2018-BayesianMass",
    "journal": "Bioinformatics",
    "number": "13",
    "pages": "i333–i340",
    "title": "Bayesian networks for mass spectrometric metabolite identification via molecular fingerprints",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/bty245",
    "volume": "34",
    "year": 2018
  },
  {
    "author": "Paruzzo, Federico M.; Hofstetter, Albert; Musil, Félix; De, Sandip; Ceriotti, Michele; Emsley, Lyndon",
    "doi": "10.1038/s41467-018-06972-x",
    "id": "Paruzzo-2018-ChemicalShifts",
    "journal": "Nature Communications",
    "number": "1",
    "pages": "4501",
    "title": "Chemical shifts in molecular solids by machine learning",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-018-06972-x",
    "volume": "9",
    "year": 2018
  },
  {
    "abstract": "Using a text-based representation of molecules, chemical reactions are predicted with a neural machine translation model borrowed from language processing.",
    "author": "Schwaller, Philippe; Gaudin, Théophile; Lányi, Dàvid; Bekas, Costas; Laino, Teodoro",
    "doi": "10.1039/c8sc02339e",
    "id": "Schwaller-2018-FoundInTranslation",
    "journal": "Chemical Science",
    "keywords": "reaction prediction, sequence-to-sequence models, neural machine translation, organic chemistry",
    "number": "28",
    "pages": "6091-6098",
    "title": "Found in translation: Predicting outcomes of complex organic chemistry reactions using neural sequence-to-sequence models",
    "type": "article",
    "url": "https://doi.org/10.1039/c8sc02339e",
    "volume": "9",
    "year": 2018
  },
  {
    "abstract": "This paper introduces COBRAme, a Python-based software framework designed to build and simulate ME-models (genome-scale models of metabolism and gene expression). Built on COBRApy, COBRAme streamlines the computation and analysis of ME-models, facilitating their construction and editing for various organisms. The authors demonstrate the utility of COBRAme by reconstructing a condensed E. coli ME-model, achieving significant improvements in computational efficiency and model accuracy.",
    "author": "Lloyd, Colton J.; Ebrahim, Ali; Yang, Laurence; King, Zachary A.; Catoiu, Edward; O’Brien, Edward J.; Liu, Joanne K.; Palsson, Bernhard Ø.",
    "doi": "10.1371/journal.pcbi.1006302",
    "id": "Lloyd-2018-COBRAme",
    "journal": "PLOS Computational Biology",
    "keywords": "COBRAme, genome-scale models, metabolism, gene expression, computational biology",
    "number": "7",
    "pages": "e1006302",
    "publisher": "PLOS",
    "title": "COBRAme: A Computational Framework for Genome-Scale Models of Metabolism and Gene Expression",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pcbi.1006302",
    "volume": "14",
    "year": 2018
  },
  {
    "abstract": "Single-cell RNA-sequencing (scRNA-seq) offers unprecedented resolution for studying cellular decision-making processes. scEpath calculates energy landscapes and probabilistic directed graphs to reconstruct developmental trajectories.",
    "author": "Jin, Suoqin; MacLean, Adam L.; Peng, Tao; Nie, Qing",
    "doi": "10.1093/bioinformatics/bty058",
    "id": "Jin-2018-scEpath",
    "journal": "Bioinformatics",
    "keywords": "scEpath, single-cell RNA-seq, energy landscapes, developmental trajectories",
    "number": "12",
    "pages": "2077-2086",
    "publisher": "Oxford University Press",
    "title": "scEpath: energy landscape-based inference of transition probabilities and cellular trajectories from single-cell transcriptomic data",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/bty058",
    "volume": "34",
    "year": 2018
  },
  {
    "abstract": "Single-cell RNA sequencing can reveal RNA abundance with high quantitative accuracy, sensitivity and throughput. However, this approach captures only a static snapshot at a point in time, posing a challenge for the analysis of time-resolved phenomena such as embryogenesis or tissue regeneration. Here we show that RNA velocity—the time derivative of the gene expression state—can be directly estimated by distinguishing between unspliced and spliced mRNAs in common single-cell RNA sequencing protocols. RNA velocity is a high-dimensional vector that predicts the future state of individual cells on a timescale of hours. We validate its accuracy in the neural crest lineage, demonstrate its use on multiple published datasets and technical platforms, reveal the branching lineage tree of the developing mouse hippocampus, and examine the kinetics of transcription in human embryonic brain. We expect RNA velocity to greatly aid the analysis of developmental lineages and cellular dynamics, particularly in humans.",
    "author": "La Manno, Giovanni; Soldatov, Ruslan; Zeisel, Amit; Braun, Emelie; Hochgerner, Hannah; Petukhov, Viktor; Lidschreiber, Katja; Kastriti, Maria; Lönnerberg, Peter; Furlan, Alessandro; Fan, Jean; Borm, Lars E.; Liu, Zehua; van Bruggen, Danny; Guo, Jimin; He, Xiaoling; Barker, Roger; Sundström, Erik; Castelo-Branco, Gonçalo; Cramer, Patrick; Adameyko, Igor; Linnarsson, Sten; Kharchenko, Peter",
    "doi": "10.1038/s41586-018-0414-6",
    "id": "LaManno-2018-RNAVelocity",
    "journal": "Nature",
    "number": "7719",
    "pages": "494-498",
    "title": "RNA velocity of single cells",
    "type": "article",
    "url": "https://doi.org/10.1038/s41586-018-0414-6",
    "volume": "560",
    "year": 2018
  },
  {
    "abstract": "A long-standing goal of artificial intelligence is an algorithm that learns, tabula rasa, superhuman proficiency in challenging domains. Recently, AlphaGo became the first program to defeat a world champion in the game of Go. The tree search in AlphaGo evaluated positions and selected moves using deep neural networks. These neural networks were trained by supervised learning from human expert moves, and by reinforcement learning from self-play. Here we introduce an algorithm based solely on reinforcement learning, without human data, guidance or domain knowledge beyond game rules. AlphaGo becomes its own teacher: a neural network is trained to predict AlphaGo’s own move selections and also the winner of AlphaGo’s games. This neural network improves the strength of the tree search, resulting in higher quality move selection and stronger self-play in the next iteration. Starting tabula rasa, our new program AlphaGo Zero achieved superhuman performance, winning 100–0 against the previously published, champion-defeating AlphaGo.",
    "author": "Silver, David; Schrittwieser, Julian; Simonyan, Karen; Antonoglou, Ioannis; Huang, Aja; Guez, Arthur; Hubert, Thomas; Baker, Lucas; Lai, Matthew; Bolton, Adrian; Chen, Yutian; Lillicrap, Timothy; Hui, Fan; Sifre, Laurent; van den Driessche, George; Graepel, Thore; Hassabis, Demis",
    "doi": "10.1038/nature24270",
    "id": "Silver-2017-GoZero",
    "journal": "Nature",
    "pages": "354–359",
    "title": "Mastering the game of Go without human knowledge",
    "type": "article",
    "url": "https://doi.org/10.1038/nature24270",
    "volume": "550",
    "year": 2017
  },
  {
    "abstract": "L2 regularization and weight decay regularization are equivalent for SGD, but not for adaptive gradient algorithms like Adam. This work proposes decoupling weight decay from the loss-based optimization step, improving generalization and allowing Adam to match SGD on image classification tasks.",
    "author": "Loshchilov, Ilya; Hutter, Frank",
    "booktitle": "",
    "doi": "10.48550/arXiv.1711.05101",
    "id": "Loshchilov-2017-AdamW",
    "journal": "arXiv",
    "keywords": "",
    "note": "Presented as a poster at ICLR 2019",
    "number": "1711.05101",
    "pages": "",
    "publisher": "",
    "title": "Decoupled weight decay regularization",
    "type": "article",
    "url": "https://arxiv.org/abs/1711.05101",
    "volume": "",
    "year": 2017
  },
  {
    "abstract": "This work introduces a method to tune a sequence-based generative model for molecular de novo design that through augmented episodic likelihood can learn to generate structures with certain specified desirable properties. We demonstrate how this model can execute a range of tasks such as generating analogues to a query structure and generating compounds predicted to be active against a biological target. As a proof of principle, the model is first trained to generate molecules that do not contain sulphur. As a second example, the model is trained to generate analogues to the drug Celecoxib, a technique that could be used for scaffold hopping or library expansion starting from a single molecule. Finally, when tuning the model towards generating compounds predicted to be active against the dopamine receptor type 2, the model generates structures of which more than 95% are predicted to be active, including experimentally confirmed actives that have not been included in either the generative model nor the activity prediction model.",
    "author": "Olivecrona, Marcus; Blaschke, Thomas; Engkvist, Ola; Chen, Hongming",
    "doi": "10.1186/s13321-017-0235-x",
    "id": "Olivecrona-2017-DeNovoRL",
    "journal": "Journal of Cheminformatics",
    "keywords": "deep reinforcement learning; de novo drug design; molecular generation; generative models; cheminformatics; SMILES; molecular optimization",
    "pages": "48",
    "title": "Molecular de-novo design through deep reinforcement learning",
    "type": "article",
    "url": "https://doi.org/10.1186/s13321-017-0235-x",
    "volume": "9",
    "year": 2017
  },
  {
    "abstract": "SMILES2Vec is a deep recurrent neural network that learns features directly from SMILES strings to predict chemical properties. It achieves competitive results across tasks like toxicity and solubility prediction and includes interpretability features highlighting influential molecular fragments.",
    "author": "Goh, Garrett B.; Hodas, Nathan Oken; Siegel, Charles; Vishnu, Abhinav",
    "booktitle": "ICLR 2018",
    "doi": "10.48550/arXiv.1712.02034",
    "id": "Goh-2018-SMILES2Vec",
    "keywords": "SMILES2Vec, SMILES, deep learning, molecular property prediction, interpretability, arXiv",
    "note": "arXiv:1712.02034",
    "title": "SMILES2Vec: An interpretable general-purpose deep neural network for predicting chemical properties",
    "type": "inproceedings",
    "url": "https://arxiv.org/abs/1712.02034",
    "year": 2017
  },
  {
    "abstract": "We describe an open-source toolkit for neural machine translation (NMT). The toolkit prioritizes efficiency, modularity, and extensibility with the goal of supporting NMT research into model architectures, feature representations, and source modalities, while maintaining competitive performance and reasonable training requirements. The toolkit consists of modeling and translation support, as well as detailed pedagogical documentation about the underlying techniques.",
    "author": "Klein, Guillaume; Kim, Yoon; Deng, Yuntian; Senellart, Jean; Rush, Alexander",
    "doi": "10.18653/v1/p17-4012",
    "id": "Klein-2017-OpenNMT",
    "journal": "Proceedings of ACL 2017, System Demonstrations",
    "keywords": "neural machine translation, open-source, OpenNMT, toolkit",
    "note": "Presented at ACL 2017, Vancouver, Canada",
    "pages": "67–72",
    "publisher": "Association for Computational Linguistics",
    "title": "OpenNMT: Open-source toolkit for neural machine translation",
    "type": "article",
    "url": "https://aclanthology.org/P17-4012/",
    "year": 2017
  },
  {
    "abstract": "Confidence calibration - the problem of predicting probability estimates representative of the true correctness likelihood - is important for classification models in many applications. We discover that modern neural networks, unlike those from a decade ago, are poorly calibrated. Through extensive experiments, we observe that depth, width, weight decay, and Batch Normalization are important factors influencing calibration. We evaluate the performance of various post-processing calibration methods on state-of-the-art architectures with image and document classification datasets. Our analysis and experiments not only offer insights into neural network learning, but also provide a simple and straightforward recipe for practical settings: on most datasets, temperature scaling - a single-parameter variant of Platt Scaling - is surprisingly effective at calibrating predictions.",
    "author": "Guo, Chuan; Pleiss, Geoff; Sun, Yu; Weinberger, Kilian Q.",
    "id": "Guo-2017-CalibrationNN",
    "journal": "Proceedings of the 34th International Conference on Machine Learning",
    "keywords": "neural networks, calibration, uncertainty estimation",
    "note": "No CrossRef DOI; ACM mirror DOI: 10.5555/3305381.3305518",
    "pages": "1321–1330",
    "publisher": "PMLR",
    "title": "Calibration of modern neural networks",
    "type": "article",
    "url": "https://proceedings.mlr.press/v70/guo17a/guo17a.pdf",
    "volume": "70",
    "year": 2017
  },
  {
    "abstract": "This dataset contains chemical reactions extracted by text-mining from United States patents published between 1976 and September 2016. Reactions are available in CML and reaction SMILES formats, derived using an enhanced version of previous extraction tools and LeadMine for chemical entity recognition. Atom-mapping was applied using the Indigo toolkit, though not always accurate. The dataset filters out invalid or trivial reactions based on structure and mapping rules. Duplication is common due to overlap between application and grant patents.",
    "author": "Lowe, Daniel",
    "doi": "10.6084/m9.figshare.5104873.v1",
    "id": "Lowe-2017-USPatents",
    "publisher": "figshare",
    "title": "Chemical reactions from US patents (1976–Sep2016)",
    "type": "dataset",
    "url": "https://doi.org/10.6084/m9.figshare.5104873.v1",
    "year": 2017
  },
  {
    "abstract": "Explores the use of SMILES enumeration as a data augmentation technique for improving the accuracy of neural network models for molecular property prediction.",
    "author": "Bjerrum, Esben Jannik",
    "doi": "10.48550/arXiv.1703.07076",
    "id": "Bjerrum-2017-SMILESEnumeration",
    "journal": "arXiv",
    "keywords": "SMILES enumeration, data augmentation, neural networks, molecular modeling, LSTM",
    "number": "1703.07076",
    "title": "SMILES enumeration as data augmentation for neural network modeling of molecules",
    "type": "article",
    "url": "https://arxiv.org/abs/1703.07076",
    "year": 2017
  },
  {
    "abstract": "Three of the most fundamental questions in biology are how individual cells differentiate to form tissues, how tissues function in a coordinated and flexible fashion and which gene regulatory mechanisms support these processes. Single-cell genomics is opening up new ways to tackle these questions by combining the comprehensive nature of genomics with the microscopic resolution that is required to describe complex multicellular systems. Initial single-cell genomic studies provided a remarkably rich phenomenology of heterogeneous cellular states, but transforming observational studies into models of dynamics and causal mechanisms in tissues poses fresh challenges and requires stronger integration of theoretical, computational and experimental frameworks.",
    "author": "Tanay, Amos; Regev, Aviv",
    "doi": "10.1038/nature21350",
    "id": "Tanay-2017-ScalingSingleCell",
    "journal": "Nature",
    "keywords": "Single-cell genomics; Mechanistic biology; Profiling methods; Phenomenology",
    "number": "7637",
    "pages": "331–338",
    "title": "Scaling single-cell genomics from phenomenology to mechanism",
    "type": "article",
    "url": "https://doi.org/10.1038/nature21350",
    "volume": "541",
    "year": 2017
  },
  {
    "abstract": "Introduces a de novo peptide topology designed to assemble into antimicrobial capsids that destroy bacterial membranes. These artificial capsids exhibit broad-spectrum antimicrobial activity and are proposed as an alternative to traditional antibiotics.",
    "author": "De Santis, Emiliana; Alkassem, Hasan; Lamarre, Baptiste; Faruqui, Nilofar; Bella, Angelo; Noble, James E.; Micale, Nicola; Ray, Santanu; Burns, Jonathan R.; Yon, Alexander R.; Hoogenboom, Bart W.; Ryadnov, Maxim G.",
    "doi": "10.1038/s41467-017-02475-3",
    "id": "DeSantis-2017-AMPCapsids",
    "journal": "Nature Communications",
    "keywords": "antimicrobial peptides, de novo design, capsids, bacterial membrane disruption, peptide topology",
    "number": "2263",
    "title": "Antimicrobial peptide capsids of de novo design",
    "type": "article",
    "url": "https://doi.org/10.1038/s41467-017-02475-3",
    "volume": "8",
    "year": 2017
  },
  {
    "abstract": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.",
    "author": "Vaswani, Ashish; Shazeer, Noam; Parmar, Niki; Uszkoreit, Jakob; Jones, Llion; Gomez, Aidan N.; Kaiser, Łukasz; Polosukhin, Illia",
    "doi": "10.48550/arXiv.1706.03762",
    "id": "Vaswani-2017-AttentionIsAll",
    "journal": "Advances in Neural Information Processing Systems",
    "keywords": "Transformer, attention mechanism, sequence transduction, deep learning, machine translation",
    "pages": "5998–6008",
    "title": "Attention Is All You Need",
    "type": "article",
    "url": "https://arxiv.org/abs/1706.03762",
    "volume": "30",
    "year": 2017
  },
  {
    "abstract": "We present “Ask Ernö”, a self-learning system for the automatic analysis of NMR spectra, consisting of integrated chemical shift assignment and prediction tools. The output of the automatic assignment component initializes and improves a database of assigned protons that is used by the chemical shift predictor. In turn, the predictions provided by the latter facilitate improvement of the assignment process. Iteration on these steps allows Ask Ernö to improve its ability to assign and predict spectra without any prior knowledge or assistance from human experts. This concept was tested by training such a system with a dataset of 2341 molecules and their 1H-NMR spectra, and evaluating the accuracy of chemical shift predictions on a test set of 298 partially assigned molecules (2007 assigned protons). After 10 iterations, Ask Ernö was able to decrease its prediction error by 17 %, reaching an average error of 0.265 ppm. Over 60 % of the test chemical shifts were predicted within 0.2 ppm, while only 5 % still presented a prediction error of more than 1 ppm. Ask Ernö introduces an innovative approach to automatic NMR analysis that constantly learns and improves when provided with new data. Furthermore, it completely avoids the need for manually assigned spectra. This system has the potential to be turned into a fully autonomous tool able to compete with the best alternatives currently available.",
    "author": "Castillo, Andrés M.; Bernal, Andrés; Dieden, Reiner; Patiny, Luc; Wist, Julien",
    "doi": "10.1186/s13321-016-0134-6",
    "id": "Castillo-2016-AskErno",
    "journal": "Journal of Cheminformatics",
    "pages": "Article number: 26",
    "title": "“Ask Ernö”: a self-learning tool for assignment and prediction of nuclear magnetic resonance spectra",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-016-0134-6",
    "volume": "8",
    "year": 2016
  },
  {
    "author": "Claridge, Timothy D. W.",
    "doi": "10.1016/c2015-0-04654-8",
    "id": "Claridge-2016-HighResNMR",
    "keywords": "NMR spectroscopy; 1H NMR; 13C NMR; HSQC; 2D NMR; organic structure elucidation; pulse sequences; spectral interpretation",
    "note": "3rd edition; Tetrahedron Organic Chemistry Series",
    "publisher": "Elsevier",
    "title": "High-Resolution NMR Techniques in Organic Chemistry",
    "type": "book",
    "url": "https://doi.org/10.1016/c2015-0-04654-8",
    "year": 2016
  },
  {
    "abstract": "Neural machine translation (NMT) models typically operate with a fixed vocabulary, but translation is an open-vocabulary problem. Previous work addresses the translation of out-of-vocabulary words by backing off to a dictionary. In this paper, we introduce a simpler and more effective approach, making the NMT model capable of open-vocabulary translation by encoding rare and unknown words as sequences of subword units. This is based on the intuition that various word classes are translatable via smaller units than words, for instance names (via character copying or transliteration), compounds (via compositional translation), and cognates and loanwords (via phonological and morphological transformations). We discuss the suitability of different word segmentation techniques, including simple character n-gram models and a segmentation based on the byte pair encoding compression algorithm, and empirically show that subword models improve over a back-off dictionary baseline for the WMT 15 translation tasks English-German and English-Russian by 1.1 and 1.3 BLEU, respectively.",
    "author": "Sennrich, Rico; Haddow, Barry; Birch, Alexandra",
    "doi": "10.18653/v1/P16-1162",
    "id": "Sennrich-2016-SubwordNMT",
    "journal": "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016)",
    "keywords": "byte pair encoding, BPE, neural machine translation, subword segmentation, rare words",
    "pages": "1715–1725",
    "title": "Neural machine translation of rare words with subword units",
    "type": "article",
    "url": "https://aclanthology.org/P16-1162",
    "year": 2016
  },
  {
    "abstract": "This paper presents DeepRED, a decompositional algorithm capable of extracting comprehensible rules from deep neural networks (DNNs). Unlike prior methods that focus on shallow networks, DeepRED recursively applies decision tree induction across DNN layers and merges intermediate rules. The evaluation across real and synthetic datasets demonstrates its ability to outperform pedagogical approaches, especially on complex problems like XOR.",
    "author": "Zilke, Jan Ruben; Loza Mencía, Eneldo; Janssen, Frederik",
    "booktitle": "Discovery Science (DS 2016)",
    "doi": "10.1007/978-3-319-46307-0_29",
    "id": "Zilke-2016-DeepRED",
    "journal": "Lecture Notes in Artificial Intelligence (LNAI)",
    "keywords": "rule extraction; deep neural networks; interpretability; decompositional algorithm; DeepRED",
    "pages": "457–473",
    "publisher": "Springer International Publishing",
    "title": "DeepRED – Rule extraction from deep neural networks",
    "type": "article",
    "url": "https://doi.org/10.1007/978-3-319-46307-0_29",
    "volume": "9956",
    "year": 2016
  },
  {
    "abstract": "Neural machine translation (NMT) offers a novel alternative formulation of translation that is potentially simpler than statistical approaches. However to reach competitive performance, NMT models need to be exceedingly large. In this paper we consider applying knowledge distillation approaches (Bucila et al., 2006; Hinton et al., 2015) that have proven successful for reducing the size of neural models in other domains to the problem of NMT. We demonstrate that standard knowledge distillation applied to word-level prediction can be effective for NMT, and also introduce two novel sequence-level versions of knowledge distillation that further improve performance, and somewhat surprisingly, seem to eliminate the need for beam search (even when applied on the original teacher model). Our best student model runs 10 times faster than its state-of-the-art teacher with little loss in performance. It is also significantly better than a baseline model trained without knowledge distillation: by 4.2/1.7 BLEU with greedy decoding/beam search. Applying weight pruning on top of knowledge distillation results in a student model that has 13 times fewer parameters than the original teacher model, with a decrease of 0.4 BLEU.",
    "author": "Kim, Yoon; Rush, Alexander M.",
    "doi": "10.18653/v1/D16-1139",
    "id": "Kim-2016-Sequence",
    "journal": "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    "pages": "1317–1327",
    "title": "Sequence-Level Knowledge Distillation",
    "type": "inproceedings",
    "url": "https://aclanthology.org/D16-1139/",
    "year": 2016
  },
  {
    "abstract": "This paper deals with the problem of quantifying the impact of model misspecification when computing general expected values of interest. The methodology that we propose is applicable in great generality, in particular, we provide examples involving path-dependent expectations of stochastic processes. Our approach consists in computing bounds for the expectation of interest regardless of the probability measure used, as long as the measure lies within a prescribed tolerance measured in terms of a flexible class of distances from a suitable baseline model. These distances, based on optimal transportation between probability measures, include Wasserstein’s distances as particular cases. The proposed methodology is well-suited for risk analysis, as we demonstrate with a number of applications. We also discuss how to estimate the tolerance region non-parametrically using Skorokhod-type embeddings in some of these applications.",
    "author": "Blanchet, Jose; Murthy, Karthyek",
    "doi": "10.2139/ssrn.2759640",
    "id": "Blanchet-2016-OTDRO",
    "journal": "Risk Management eJournal (SSRN)",
    "keywords": "Model risk, distributional robustness, transport metric, Wasserstein distances, duality, Kullback-Leibler divergences, ruin probabilities, diffusion approximations",
    "pages": "32",
    "title": "Quantifying distributional model risk via optimal transport",
    "type": "article",
    "url": "https://ssrn.com/abstract=2759640",
    "year": 2016
  },
  {
    "abstract": "The aim of this paper is to provide a new method for learning the relationships between data that have been obtained independently. Unlike existing methods like matching, the proposed technique does not require any contextual information, provided that the dependency between the variables of interest is monotone. It can therefore be easily combined with matching in order to exploit the advantages of both methods. This technique can be described as a mix between quantile matching and deconvolution. We provide for it a theoretical and an empirical validation.",
    "author": "Carpentier, Alexandra; Schlüter, Teresa",
    "booktitle": "19th International Conference on Artificial Intelligence and Statistics (AISTATS)",
    "id": "Carpentier-2016-IndependentData",
    "keywords": "Distribution-to-Distribution Regression; Uncoupled Data Regression; Probabilistic Regression; Scalable Machine Learning; Privacy-Preserving Regression",
    "pages": "658–666",
    "publisher": "PMLR",
    "title": "Learning relationships between data obtained independently",
    "type": "conference",
    "url": "https://proceedings.mlr.press/v51/carpentier16b.html",
    "year": 2016
  },
  {
    "abstract": "This paper introduces RaptorX-Property, a web server that predicts protein structure properties, including secondary structure, solvent accessibility, and disorder regions, using deep learning techniques.",
    "author": "Wang, Sheng; Peng, Jian; Ma, Jianzhu; Xu, Jinbo",
    "doi": "10.1093/nar/gkw306",
    "id": "Wang-2016-RaptorX",
    "journal": "Nucleic Acids Research",
    "keywords": "RaptorX, Protein structure prediction, Secondary structure, Solvent accessibility, Disorder regions, Deep learning",
    "number": "W1",
    "pages": "W430–W435",
    "title": "RaptorX-Property: a web server for protein structure property prediction",
    "type": "article",
    "url": "https://doi.org/10.1093/nar/gkw306",
    "volume": "44",
    "year": 2016
  },
  {
    "abstract": "nmrshiftdb2 supports with its laboratory information management system the integration of an electronic lab administration and management into academic NMR facilities. Also, it offers the setup of a local database, while full access to nmrshiftdb2's World Wide Web database is granted. This freely available system allows on the one hand the submission of orders for measurement, transfers recorded data automatically or manually, and enables download of spectra via web interface, as well as the integrated access to prediction, search, and assignment tools of the NMR database for lab users. On the other hand, for the staff and lab administration, flow of all orders can be supervised; administrative tools also include user and hardware management, a statistic functionality for accounting purposes, and a ‘QuickCheck’ function for assignment control, to facilitate quality control of assignments submitted to the (local) database. Laboratory information management system and database are based on a web interface as front end and are therefore independent of the operating system in use.",
    "author": "Kuhn, Stefan; Schlörer, Nils E.",
    "doi": "10.1002/mrc.4263",
    "id": "Kuhn-2015-NMRShiftDB2",
    "journal": "Magnetic Resonance in Chemistry",
    "number": "8",
    "pages": "582–589",
    "title": "Facilitating quality control for spectra assignments of small organic molecules: nmrshiftdb2 – a free in-house NMR database with integrated LIMS for academic service laboratories",
    "type": "article",
    "url": "https://doi.org/10.1002/mrc.4263",
    "volume": "53",
    "year": 2015
  },
  {
    "abstract": "Finding a canonical ordering of the atoms in a molecule is a prerequisite for generating a unique representation of the molecule. The canonicalization of a molecule is usually accomplished by applying some sort of graph relaxation algorithm, the most common of which is the Morgan algorithm. There are known issues with that algorithm that lead to noncanonical atom orderings as well as problems when it is applied to large molecules like proteins. Furthermore, each cheminformatics toolkit or software provides its own version of a canonical ordering, most based on unpublished algorithms, which also complicates the generation of a universal unique identifier for molecules. We present an alternative canonicalization approach that uses a standard stable-sorting algorithm instead of a Morgan-like index. Two new invariants that allow canonical ordering of molecules with dependent chirality as well as those with highly symmetrical cyclic graphs have been developed. The new approach proved to be robust and fast when tested on the 1.45 million compounds of the ChEMBL 20 data set in different scenarios like random renumbering of input atoms or SMILES round tripping. Our new algorithm is able to generate a canonical order of the atoms of protein molecules within a few milliseconds. The novel algorithm is implemented in the open-source cheminformatics toolkit RDKit. With this paper, we provide a reference Python implementation of the algorithm that could easily be integrated in any cheminformatics toolkit. This provides a first step toward a common standard for canonical atom ordering to generate a universal unique identifier for molecules other than InChI.",
    "author": "Schneider, Nadine; Sayle, Roger A.; Landrum, Gregory A.",
    "doi": "10.1021/acs.jcim.5b00543",
    "id": "Schneider-2015-Canonicalization",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "10",
    "pages": "2111–2120",
    "title": "Get your atoms in order—An open-source implementation of a novel and robust molecular canonicalization algorithm",
    "type": "article",
    "url": "https://doi.org/10.1021/acs.jcim.5b00543",
    "volume": "55",
    "year": 2015
  },
  {
    "abstract": "Cheminformaticians are equipped with a very rich toolbox when carrying out molecular similarity calculations. A large number of molecular representations exist, and there are several methods (similarity and distance metrics) to quantify the similarity of molecular representations. In this work, eight well-known similarity/distance metrics are compared on a large dataset of molecular fingerprints with sum of ranking differences (SRD) and ANOVA analysis. The effects of molecular size, selection methods and data pretreatment methods on the outcome of the comparison are also assessed. A supplier database (https://mcule.com/) was used as the source of compounds. Similarity metrics were compared by ranking compounds using SRD. Tanimoto, Dice, Cosine, and Soergel were best performers. Euclidean and Manhattan were not recommended unless used for diversity.",
    "author": "Bajusz, Dávid; Rácz, Anita; Héberger, Károly",
    "doi": "10.1186/s13321-015-0069-3",
    "id": "Bajusz-2015-Tanimoto",
    "journal": "Journal of Cheminformatics",
    "keywords": "Tanimoto index, molecular similarity, cheminformatics, similarity metrics, SRD, ANOVA",
    "pages": "Article 20",
    "title": "Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?",
    "type": "article",
    "url": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-015-0069-3",
    "volume": "7",
    "year": 2015
  },
  {
    "abstract": "Metabolites provide a direct functional signature of cellular state. Untargeted metabolomics experiments usually rely on tandem MS to identify the thousands of compounds in a biological sample. Today, the vast majority of metabolites remain unknown. We present a method for searching molecular structure databases using tandem MS data of small molecules. Our method computes a fragmentation tree that best explains the fragmentation spectrum of an unknown molecule. We use the fragmentation tree to predict the molecular structure fingerprint of the unknown compound using machine learning. This fingerprint is then used to search a molecular structure database such as PubChem. Our method is shown to improve on the competing methods for computational metabolite identification by a considerable margin.",
    "author": "Dührkop, Kai; Shen, Huibin; Meusel, Marvin; Rousu, Juho; Böcker, Sebastian",
    "doi": "10.1073/pnas.1509788112",
    "id": "Dührkop-2015-CSI",
    "journal": "Proceedings of the National Academy of Sciences",
    "number": "41",
    "pages": "12580–12585",
    "title": "Searching molecular structure databases with tandem mass spectra using CSI:FingerID",
    "type": "article",
    "url": "https://www.pnas.org/doi/10.1073/pnas.1509788112",
    "volume": "112",
    "year": 2015
  },
  {
    "abstract": "We propose a deep convolutional neural network architecture codenamed Inception that achieves the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. By a carefully crafted design, we increased the depth and width of the network while keeping the computational budget constant. To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. One particular incarnation used in our submission for ILSVRC14 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection.",
    "author": "Szegedy, Christian; Liu, Wei; Jia, Yangqing; Sermanet, Pierre; Reed, Scott; Anguelov, Dragomir; Erhan, Dumitru; Vanhoucke, Vincent; Rabinovich, Andrew",
    "doi": "10.1109/cvpr.2015.7298594",
    "id": "Szegedy-2015-GoingDeeper",
    "journal": "2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
    "pages": "1–9",
    "publisher": "IEEE",
    "title": "Going deeper with convolutions",
    "type": "inproceedings",
    "url": "https://doi.org/10.1109/cvpr.2015.7298594",
    "year": 2015
  },
  {
    "abstract": "Stochastic variational inference enables rapid approximation of posterior distributions induced by large datasets through stochastic optimization. However, it traditionally relies on fully factorized variational distributions, known as the 'mean-field' independence approximation, which can limit the fidelity of the posterior approximation and introduce local optima. This paper presents a method to relax the mean-field approximation, allowing for arbitrary dependencies between global parameters and local hidden variables. This approach aims to produce better parameter estimates by reducing bias, sensitivity to local optima, and sensitivity to hyperparameters.",
    "author": "Hoffman, Matthew D.; Blei, David M.",
    "id": "Hoffman-2015-StructuredSVI",
    "journal": "Proceedings of the International Conference on Artificial Intelligence and Statistics (AISTATS)",
    "keywords": "Stochastic Variational Inference; Mean-Field Approximation; Posterior Approximation; Stochastic Optimization; Variational Inference",
    "title": "Structured Stochastic Variational Inference",
    "type": "conference",
    "url": "https://arxiv.org/abs/1404.4114",
    "year": 2015
  },
  {
    "abstract": "A very simple way to improve the performance of almost any machine learning algorithm is to train many different models on the same data and then to average their predictions. Unfortunately, making predictions using a whole ensemble of models is cumbersome and may be too computationally expensive to allow deployment to a large number of users, especially if the individual models are large neural nets. Caruana and his collaborators have shown that it is possible to compress the knowledge in an ensemble into a single model which is much easier to deploy and we develop this approach further using a different compression technique. We achieve some surprising results on MNIST and we show that we can significantly improve the acoustic model of a heavily used commercial system by distilling the knowledge in an ensemble of models into a single model. We also introduce a new type of ensemble composed of one or more full models and many specialist models which learn to distinguish fine-grained classes that the full models confuse. Unlike a mixture of experts, these specialist models can be trained rapidly and in parallel.",
    "author": "Hinton, Geoffrey; Vinyals, Oriol; Dean, Jeff",
    "doi": "10.48550/arXiv.1503.02531",
    "id": "Hinton-2014-Knowledge",
    "journal": "NIPS 2014 Deep Learning Workshop",
    "title": "Distilling the knowledge in a neural network",
    "type": "article",
    "url": "https://arxiv.org/abs/1503.02531",
    "year": 2014
  },
  {
    "abstract": "This paper presents a unified framework for estimating species diversity using Hill numbers (qD) that integrates rarefaction (interpolation) and extrapolation. It extends earlier models for species richness (q = 0) to q > 0, allowing estimation from both abundance and incidence data. Analytical estimators and confidence intervals are developed for q = 0 (species richness), q = 1 (Shannon diversity), and q = 2 (Simpson diversity). Performance is evaluated using simulated and empirical datasets.",
    "author": "Chao, Anne; Gotelli, Nicholas J.; Hsieh, T. C.; Sander, E. L.; Ma, K. H.; Colwell, Robert K.; Ellison, Aaron M.",
    "doi": "10.1890/13-0133.1",
    "id": "Chao-2014-HillNumbers",
    "journal": "Ecological Monographs",
    "keywords": "Hill numbers, species richness, rarefaction, extrapolation, diversity estimation, bootstrap",
    "number": "1",
    "pages": "45–67",
    "title": "Rarefaction and extrapolation with Hill numbers: a framework for sampling and estimation in species diversity studies",
    "type": "article",
    "url": "https://doi.org/10.1890/13-0133.1",
    "volume": "84",
    "year": 2014
  },
  {
    "abstract": "Optimal transport distances have been used for more than a decade in machine learning to compare histograms of features. They have one parameter: the ground metric, which can be any metric between the features themselves. As is the case for all parameterized distances, optimal transport distances can only prove useful in practice when this parameter is carefully chosen. To date, the only option available to practitioners to set the ground metric parameter was to rely on a priori knowledge of the features, which limited considerably the scope of application of optimal transport distances. We propose to lift this limitation and consider instead algorithms that can learn the ground metric using only a training set of labeled histograms. We call this approach ground metric learning. We formulate the problem of learning the ground metric as the minimization of the difference of two convex polyhedral functions over a convex set of metric matrices. We follow the presentation of our algorithms with promising experimental results which show that this approach is useful both for retrieval and binary/multiclass classification tasks.",
    "author": "Cuturi, Marco; Avis, David",
    "id": "Cuturi-2014-GroundMetric",
    "journal": "Journal of Machine Learning Research",
    "number": "17",
    "pages": "533–564",
    "title": "Ground metric learning",
    "type": "article",
    "url": "https://jmlr.org/papers/v15/cuturi14a.html",
    "volume": "15",
    "year": 2014
  },
  {
    "abstract": "This article provides a comprehensive survey of the Schrödinger problem and its relationship with optimal transport. It explores mathematical foundations, connections to entropy minimization, and computational implications in various applied contexts.",
    "author": "Léonard, Christian",
    "doi": "10.3934/dcds.2014.34.1533",
    "id": "Leonard-2014-SchrodingerSurvey",
    "journal": "Discrete and Continuous Dynamical Systems - A",
    "keywords": "Schrödinger Problem, Optimal Transport, Entropy Minimization, Mathematical Foundations",
    "pages": "1533-1574",
    "title": "A survey of the Schrödinger problem and some of its connections with optimal transport",
    "type": "article",
    "url": "https://doi.org/10.3934/dcds.2014.34.1533",
    "volume": "34",
    "year": 2014
  },
  {
    "abstract": "RDKit is an open-source cheminformatics toolkit widely used in molecular modeling, drug discovery, and computational chemistry. It provides tools for molecule representation, substructure searching, descriptor calculation, and integration with machine learning workflows. Although not formally published in a peer-reviewed journal, RDKit is a foundational tool in the cheminformatics community.",
    "author": "Landrum, Greg",
    "id": "Landrum-2013-RDKit",
    "keywords": "cheminformatics, open-source, molecular modeling, RDKit, software, computational chemistry",
    "note": "[Online; accessed 04-July-2025]",
    "title": "RDKit: Open-source cheminformatics",
    "type": "online",
    "url": "https://www.rdkit.org",
    "year": 2013
  },
  {
    "abstract": "Optimal transportation distances are a fundamental family of parameterized distances for histograms. Despite their appealing theoretical properties, excellent performance in retrieval tasks and intuitive formulation, their computation involves the resolution of a linear program whose cost is prohibitive whenever the histograms' dimension exceeds a few hundreds. We propose in this work a new family of optimal transportation distances that look at transportation problems from a maximum-entropy perspective. We smooth the classical optimal transportation problem with an entropic regularization term, and show that the resulting optimum is also a distance which can be computed through Sinkhorn-Knopp's matrix scaling algorithm at a speed that is several orders of magnitude faster than that of transportation solvers. We also report improved performance over classical optimal transportation distances on the MNIST benchmark problem.",
    "author": "Cuturi, Marco",
    "doi": "",
    "id": "Cuturi-2013-Sinkhorn",
    "journal": "Advances in Neural Information Processing Systems",
    "pages": "2292-2300",
    "title": "Sinkhorn distances: Lightspeed computation of optimal transport",
    "type": "article",
    "url": "https://arxiv.org/abs/1306.0895",
    "year": 2013
  },
  {
    "abstract": "This study explores cell fate decisions in differentiation and reprogramming using an energy landscape approach. It characterizes stability and transitions of cell states by quantifying the topography of the energy landscape. The results provide insights into stem cell behaviors, stochastic transitions, and reprogramming efficiency, offering a framework to understand cell state dynamics under various perturbations.",
    "author": "Li, Chunhe; Wang, Jin",
    "doi": "10.1371/journal.pcbi.1003165",
    "id": "Li-2013-HumanStemCells",
    "journal": "PLOS Computational Biology",
    "number": "8",
    "pages": "e1003165",
    "title": "Quantifying Cell Fate Decisions for Differentiation and Reprogramming of a Human Stem Cell Network: Landscape and Biological Paths",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pcbi.1003165",
    "volume": "9",
    "year": 2013
  },
  {
    "abstract": "This review discusses the global status of antibiotic resistance, its major causes and consequences, and identifies key areas where urgent action is needed.",
    "author": "Laxminarayan, Ramanan; Duse, Adriano; Wattal, Chand; Zaidi, Anita K.M.; Wertheim, Heiman F.L.; Sumpradit, Nithima; Vlieghe, Erika; Levy Hara, Gabriel; Gould, Ian M.; Goossens, Herman; Greko, Christina; So, Anthony D.; Bigdeli, Maryam; Tomson, Göran; Woodhouse, Will; Ombaka, Eva; Peralta, Arturo Q.; Qamar, Farah N.; Mir, Fatima; Kariuki, Sam; Bhutta, Zulfiqar A.; Coates, Anthony; Bergstrom, Richard; Wright, Gerard D.; Brown, Eric D.; Cars, Otto",
    "doi": "10.1016/S1473-3099(13)70318-9",
    "id": "Laxminarayan-2013-GlobalSolutions",
    "journal": "The Lancet Infectious Diseases",
    "number": "12",
    "pages": "1057-1098",
    "publisher": "Elsevier",
    "title": "Antibiotic resistance—the need for global solutions",
    "type": "article",
    "url": "https://doi.org/10.1016/S1473-3099(13)70318-9",
    "volume": "13",
    "year": 2013
  },
  {
    "abstract": "Stochastic Variational Inference (SVI) is a scalable algorithm for fitting probabilistic models to massive datasets. SVI uses stochastic optimization to maximize a variational objective, optimizing global and local variables alternately. This paper applies SVI to latent Dirichlet allocation (LDA) and extensions.",
    "author": "Hoffman, Matthew D.; Blei, David M.; Wang, Chong; Paisley, John",
    "doi": "10.5555/2567709.2567734",
    "id": "Hoffman-2013-SVI",
    "journal": "Journal of Machine Learning Research",
    "keywords": "Stochastic Variational Inference; Variational Objective; Latent Dirichlet Allocation",
    "number": "",
    "pages": "1303-1347",
    "publisher": "",
    "title": "Stochastic Variational Inference",
    "type": "article",
    "url": "https://jmlr.org/papers/volume14/hoffman13a/hoffman13a.pdf",
    "volume": "14",
    "year": 2013
  },
  {
    "abstract": "ZINC is a free public resource for ligand discovery, containing over twenty million commercially available molecules in biologically relevant formats. The database supports structure- and property-based searches, subset creation, and vendor-ready exports, and is maintained for high purchasing success. It is freely accessible at zinc.docking.org.",
    "author": "Irwin, John J.; Sterling, Teague; Mysinger, Michael M.; Bolstad, Erin S.; Coleman, Ryan G.",
    "doi": "10.1021/ci3001277",
    "id": "Irwin-2012-ZINC",
    "journal": "Journal of Chemical Information and Modeling",
    "keywords": "ligand discovery; virtual screening; docking; chemical databases; small molecules; ZINC database",
    "number": "7",
    "pages": "1757–1768",
    "title": "ZINC: a free tool to discover chemistry for biology",
    "type": "article",
    "url": "https://doi.org/10.1021/ci3001277",
    "volume": "52",
    "year": 2012
  },
  {
    "author": "Watrous, Jeramie; Roach, Patrick; Alexandrov, Theodore; Heath, Brandi S.; Yang, Jane Y.; Kersten, Roland D.; van der Voort, Menno; Pogliano, Kit; Gross, Harald; Raaijmakers, Jos M.; Moore, Bradley S.; Laskin, Julia; Bandeira, Nuno; Dorrestein, Pieter C.",
    "doi": "10.1073/pnas.1203689109",
    "id": "Watrous-2012-MolecularNetworking",
    "journal": "Proceedings of the National Academy of Sciences",
    "number": "26",
    "pages": "E1743–E1752",
    "title": "Mass spectral molecular networking of living microbial colonies",
    "type": "article",
    "url": "https://doi.org/10.1073/pnas.1203689109",
    "volume": "109",
    "year": 2012
  },
  {
    "abstract": "This study examines a quasi-potential framework to explore the dynamics and stability of gene regulatory networks, particularly focusing on metastability and transitions between cellular states.",
    "author": "Wang, Yong; Xu, Lili; Wang, Jin",
    "doi": "10.1063/1.3683930",
    "id": "Wang-2012-QuasiPotential",
    "journal": "Journal of Chemical Physics",
    "keywords": "Quasi-potential, gene regulatory networks, metastability, dynamics",
    "number": "3",
    "pages": "154109",
    "publisher": "American Institute of Physics",
    "title": "Quasi-potential landscape framework for dynamics in gene regulatory networks",
    "type": "article",
    "url": "https://doi.org/10.1063/1.3683930",
    "volume": "136",
    "year": 2012
  },
  {
    "abstract": "This study evaluates inter-residue contact definitions and their impact on protein fold recognition. By developing effective contact definitions, the authors significantly improve the accuracy of protein fold recognition models.",
    "author": "Yuan, Chao; Chen, Hao; Kihara, Daisuke",
    "doi": "10.1186/1471-2105-13-292",
    "id": "Yuan-2012-InterResidueContact",
    "journal": "BMC Bioinformatics",
    "keywords": "protein fold recognition, inter-residue contact, computational biology",
    "number": "1",
    "pages": "292",
    "title": "Effective inter-residue contact definitions for accurate protein fold recognition",
    "type": "article",
    "url": "https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-292",
    "volume": "13",
    "year": 2012
  },
  {
    "author": "Pedregosa, Fabian; Varoquaux, Gaël; Gramfort, Alexandre; Michel, Vincent; Thirion, Bertrand; Grisel, Olivier; Blondel, Mathieu; Prettenhofer, Peter; Weiss, Ron; Dubourg, Vincent; Vanderplas, Jake; Passos, Alexandre; Cournapeau, David; Brucher, Matthieu; Perrot, Matthieu; Duchesnay, Édouard",
    "id": "Pedregosa-2011-ScikitLearn",
    "journal": "Journal of Machine Learning Research",
    "keywords": "scikit-learn, machine learning, Python, API, documentation, library",
    "note": "Accessed July 2025. General documentation and foundational description of the scikit-learn library.",
    "pages": "2825–2830",
    "title": "Scikit-learn: Machine learning in Python",
    "type": "article",
    "url": "https://scikit-learn.org/stable/",
    "volume": "12",
    "year": 2011
  },
  {
    "abstract": "Open Babel is an open-source chemical toolbox that facilitates interconversion between over 110 chemical file formats. The toolkit also includes cheminformatics capabilities such as partial charge assignment, aromaticity detection, substructure and similarity searching, and 3D conformer generation. This article outlines the design and features of Open Babel 2.3 and highlights its use in drug discovery, materials science, and computational chemistry.",
    "author": "O'Boyle, Noel M.; Banck, Michael; James, Craig A.; Morley, Chris; Vandermeersch, Tim; Hutchison, Geoffrey R.",
    "doi": "10.1186/1758-2946-3-33",
    "id": "OBoyle-2011-OpenBabel",
    "journal": "Journal of Cheminformatics",
    "keywords": "cheminformatics, open-source, file format conversion, Open Babel, molecular toolkit, substructure search",
    "pages": "33",
    "title": "Open Babel: An open chemical toolbox",
    "type": "article",
    "url": "https://doi.org/10.1186/1758-2946-3-33",
    "volume": "3",
    "year": 2011
  },
  {
    "abstract": "Quantifying the Waddington landscape and associated biological paths for cell development and differentiation has long been an elusive goal in understanding the mechanisms of cellular processes. This paper provides a theoretical and computational framework for mapping the Waddington landscape and calculating transition paths between cell states. The results reveal the landscape topography and the effects of barrier heights on cell differentiation stability. The study further addresses reverse differentiation and reprogramming within the landscape context, revealing how perturbations can shift cellular dynamics.",
    "author": "Wang, Jin; Zhang, Kun; Xu, Li; Wang, Erkang",
    "doi": "10.1073/pnas.1017017108",
    "id": "Wang-2011-WaddingtonLandscape",
    "journal": "Proceedings of the National Academy of Sciences",
    "number": "20",
    "pages": "8257-8262",
    "title": "Quantifying the Waddington landscape and biological paths for development and differentiation",
    "type": "article",
    "url": "https://doi.org/10.1073/pnas.1017017108",
    "volume": "108",
    "year": 2011
  },
  {
    "abstract": "A textbook aimed at graduate students and researchers, providing a deep theoretical and practical understanding of high-resolution NMR spectroscopy techniques, including 1D and 2D methods such as COSY and HSQC. Emphasis on quantum mechanics and pulse sequence logic.",
    "author": "Keeler, James",
    "id": "Keeler-2010-UnderstandingNMR",
    "keywords": "NMR spectroscopy; 1H NMR; 13C NMR; HSQC; 2D NMR; product operators; spin dynamics; quantum mechanics; spectral interpretation",
    "note": "2nd edition",
    "publisher": "John Wiley \\& Sons",
    "title": "Understanding NMR spectroscopy",
    "type": "book",
    "url": "https://www.wiley.com/en-us/Understanding+NMR+Spectroscopy%2C+2nd+Edition-p-9780470746097",
    "year": 2010
  },
  {
    "abstract": "Studying the cell cycle process is crucial for understanding cell growth, proliferation, development, and death. We uncovered some key factors in determining the global robustness and function of the budding yeast cell cycle by exploring the underlying landscape and flux of this nonequilibrium network. The dynamics of the system is determined by both the landscape which attracts the system down to the oscillation orbit and the curl flux which drives the periodic motion on the ring. This global structure of landscape is crucial for the coherent cell cycle dynamics and function. The topography of the underlying landscape, specifically the barrier height separating basins of attractions, characterizes the capability of changing from one part of the system to another. This quantifies the stability and robustness of the system. We studied how barrier height is influenced by environmental fluctuations and perturbations on specific wirings of the cell cycle network. When the fluctuations increase, the barrier height decreases and the period and amplitude of cell cycle oscillation is more dispersed and less coherent. The corresponding dissipation of the system quantitatively measured by the entropy production rate increases. This implies that the system is less stable under fluctuations. We identified some key structural elements for wirings of the cell cycle network responsible for the change of the barrier height and therefore the global stability of the system through the sensitivity analysis. The results are in agreement with recent experiments and also provide new predictions.",
    "author": "Wang, Jin; Li, Chunhe; Wang, Erkang",
    "doi": "10.1073/pnas.0910331107",
    "id": "Wang-2010-BuddingYeast",
    "journal": "Proceedings of the National Academy of Sciences",
    "number": "18",
    "pages": "8195-8200",
    "title": "Potential and flux landscapes quantify the stability and robustness of budding yeast cell cycle network",
    "type": "article",
    "url": "https://doi.org/10.1073/pnas.0910331107",
    "volume": "107",
    "year": 2010
  },
  {
    "abstract": "An overview of flux balance analysis (FBA), a mathematical approach for analyzing the flow of metabolites through a metabolic network.",
    "author": "Orth, Jeffrey D.; Thiele, Ines; Palsson, Bernhard Ø.",
    "doi": "10.1038/nbt.1614",
    "id": "Orth-2010-FBA",
    "journal": "Nature Biotechnology",
    "keywords": "flux balance analysis, FBA, systems biology, metabolic networks",
    "number": "3",
    "pages": "245-248",
    "publisher": "Springer Science and Business Media LLC",
    "title": "What is flux balance analysis?",
    "type": "article",
    "url": "https://doi.org/10.1038/nbt.1614",
    "volume": "28",
    "year": 2010
  },
  {
    "abstract": "Discusses the design and activity of antimicrobial peptides with dual-target properties. These peptides exhibit targeted killing of specific pathogens, demonstrating potential applications in precision antimicrobial therapy.",
    "author": "He, Jian; Anderson, Maxwell H.; Shi, Wenyuan; Eckert, Randal",
    "doi": "10.1016/j.ijantimicag.2008.11.013",
    "id": "He-2009-DualTargetAMP",
    "journal": "International Journal of Antimicrobial Agents",
    "keywords": "antimicrobial peptides, dual-targeting, peptide design, precision therapy, antimicrobial agents",
    "number": "6",
    "title": "Design and activity of a ‘dual-targeted’ antimicrobial peptide",
    "type": "article",
    "url": "https://doi.org/10.1016/j.ijantimicag.2008.11.013",
    "volume": "33",
    "year": 2009
  },
  {
    "author": "Stein, Stephen E.",
    "id": "Stein-2008-NIST35",
    "note": "5,228 gas‑phase IR spectra (3,108 EPA + 2,120 NIST) in JCAMP‑DX with structure MOL‑files; normalized absorbance, intended for compound identification",
    "publisher": "Standard Reference Data Program, National Institute of Standards and Technology (NIST)",
    "title": "NIST/EPA gas‑phase infrared database (SRD 35)",
    "type": "misc",
    "url": "https://www.nist.gov/srd/nist-standard-reference-database-35",
    "year": 2008
  },
  {
    "author": "Abraham, Raymond J.; Mobli, Mehdi",
    "doi": "10.1002/9780470721803",
    "id": "Abraham-2008-ModellingNMR",
    "keywords": "NMR, prediction, proton spectra, modelling, organic chemistry",
    "note": "1st edition, hardcover, 392 pages",
    "publisher": "Wiley",
    "title": "Modelling {¹H} NMR spectra of organic compounds: Theory, applications and NMR prediction software",
    "type": "book",
    "url": "https://doi.org/10.1002/9780470721803",
    "year": 2008
  },
  {
    "abstract": "Comprehensive introduction to MR imaging physics and clinical applications.",
    "author": "Chrysikopoulos, Haris S.",
    "doi": "10.1007/978-3-540-78023-6",
    "id": "Chrysikopoulos-2008-ClinicalMRI",
    "keywords": "MRI, Clinical Imaging, Diagnostic Radiology, Physics, Tutorial",
    "note": "First Edition",
    "publisher": "Springer Berlin, Heidelberg",
    "title": "Clinical MR Imaging and Physics: A Tutorial",
    "type": "book",
    "url": "https://doi.org/10.1007/978-3-540-78023-6",
    "year": 2008
  },
  {
    "abstract": "We have developed a database containing 13C spectral information of over 6000 natural compounds, enabling rapid identification of known compounds in crude extracts and aiding structural elucidation in bioassay-guided natural product discovery workflows.",
    "author": "López-Pérez, José Luis; Therón, Roberto; del Olmo, Esther; Díaz, David",
    "doi": "10.1093/bioinformatics/btm516",
    "id": "LopezPerez-2007-NAPROC13",
    "journal": "Bioinformatics",
    "number": "23",
    "pages": "3256–3257",
    "title": "NAPROC-13: a database for the dereplication of natural product mixtures in bioassay guided protocols",
    "type": "article",
    "url": "https://doi.org/10.1093/bioinformatics/btm516",
    "volume": "23",
    "year": 2007
  },
  {
    "author": "Apodaca, Richard L.",
    "doi": "10.59350/6j774-g9p10",
    "id": "Apodaca-2007-OpenSMILES",
    "title": "Making the Case: OpenSMILES",
    "type": "online",
    "url": "https://doi.org/10.59350/6j774-g9p10",
    "year": 2007
  },
  {
    "abstract": "Fast accurate predictions of 1H NMR spectra of organic compounds play an important role in structure validation, automatic structure elucidation, or calibration of chemometric methods. The SPINUS program is a feed-forward neural network (FFNN) system developed over the last 8 years for the prediction of 1H NMR properties from the molecular structure. It was trained using a series of empirical proton descriptors. Ensembles of FFNNs were incorporated into Associative Neural Networks (ASNN), which correct a prediction on the basis of the observed errors for the k nearest neighbors in an additional memory. Here we show a procedure to estimate coupling constants with the ASNNs trained for chemical shifts—a second memory is linked consisting of coupled protons and their experimental coupling constants. An ASNN finds the pairs of coupled protons most similar to a query, and these are used to estimate coupling constants. Using a diverse general data set of 618 coupling constants, mean absolute errors of 0.6–0.8 Hz could be achieved in different experiments. A Web interface for 1H NMR full-spectrum prediction is available at http://www.dq.fct.unl.pt/spinus.",
    "author": "Binev, Yuri; Marques, Maria M. B.; Aires-de-Sousa, João",
    "doi": "10.1021/ci700172n",
    "id": "Binev-2007-NMRCoupling",
    "journal": "Journal of Chemical Information and Modeling",
    "number": "6",
    "pages": "2089–2097",
    "title": "Prediction of 1H NMR Coupling Constants with Associative Neural Networks Trained for Chemical Shifts",
    "type": "article",
    "url": "https://pubs.acs.org/doi/10.1021/ci700172n",
    "volume": "47",
    "year": 2007
  },
  {
    "author": "Kim, Keun-young; Wang, Jin",
    "doi": "10.1371/journal.pcbi.0030060.eor",
    "id": "Kim-2007-ToggleSwitch",
    "journal": "PLoS Computational Biology",
    "keywords": "Potential energy landscape, gene regulatory network, toggle switch, robustness",
    "pages": "e60",
    "publisher": "Public Library of Science (PLoS)",
    "title": "Potential Energy Landscape and Robustness of A Gene Regulatory Network: Toggle Switch",
    "type": "article",
    "url": "https://doi.org/10.1371/journal.pcbi.0030060.eor",
    "volume": "preprint",
    "year": 2007
  },
  {
    "author": "Neubeck, Alexander; Van Gool, Luc",
    "doi": "10.1109/ICPR.2006.479",
    "id": "Neubeck-2006-EfficientNMS",
    "journal": "Proceedings of the 18th International Conference on Pattern Recognition (ICPR'06)",
    "pages": "850–855",
    "publisher": "IEEE",
    "title": "Efficient Non-Maximum Suppression",
    "type": "proceedings-article",
    "url": "https://doi.org/10.1109/ICPR.2006.479",
    "volume": "3",
    "year": 2006
  },
  {
    "abstract": "TREPAN is an algorithm for extracting comprehensible rules from trained neural networks. Originally applied to bioinformatics problems, its use has now been extended to chemoinformatics datasets such as QSAR. The paper presents the method, compares its performance against C5 rule induction, and discusses its utility across several datasets including drug classification and conformational analysis.",
    "author": "Hudson, Brian D.; Whitley, David C.; Browne, Antony; Ford, Martyn G.",
    "id": "Hudson-2005-TREPAN",
    "journal": "Croatica Chemica Acta",
    "keywords": "TREPAN; rule extraction; neural networks; bioinformatics; chemoinformatics; decision trees",
    "number": "4",
    "pages": "557–561",
    "publisher": "Croatian Chemical Society",
    "title": "Extraction of comprehensible logical rules from neural networks: Application of TREPAN in bio and chemoinformatics",
    "type": "article",
    "volume": "78",
    "year": 2005
  },
  {
    "author": "Stoica, P.; Selen, Y.",
    "doi": "10.1109/msp.2004.1311138",
    "id": "Stoica-2004-ModelOrder",
    "journal": "IEEE Signal Processing Magazine",
    "number": "4",
    "pages": "36-47",
    "title": "Model-order selection: a review of information criterion rules",
    "type": "article",
    "url": "https://doi.org/10.1109/msp.2004.1311138",
    "volume": "21",
    "year": 2004
  },
  {
    "abstract": "This paper presents SFGA, a classification method that uses spline-fitting and genetic algorithms to develop structure–activity relationships from 1-D and 2-D molecular descriptors. Tested on five compound series, SFGA outperformed traditional models in most cases, with results compared against SIMCA and recursive partitioning.",
    "author": "Sutherland, Jeffrey J.; O'Brien, Lee A.; Weaver, Donald F.",
    "doi": "10.1021/ci034143r",
    "id": "Sutherland-2003-SFGA",
    "journal": "Journal of Chemical Information and Computer Sciences",
    "keywords": "QSAR, SFGA, genetic algorithm, classification models, cheminformatics, DHFR",
    "number": "6",
    "pages": "1906–1915",
    "title": "Spline-fitting with a genetic algorithm: A method for developing classification structure–activity relationships",
    "type": "article",
    "url": "https://doi.org/10.1021/ci034143r",
    "volume": "43",
    "year": 2003
  },
  {
    "abstract": "In this paper, we describe a new method for constructing minimal, deterministic, acyclic finite-state automata from a set of strings. Traditional methods consist of two phases: the first to construct a trie, the second one to minimize it. Our approach is to construct a minimal automaton in a single phase by adding new strings one by one and minimizing the resulting automaton on-the-fly. We present a general algorithm as well as a specialization that relies upon the lexicographical ordering of the input strings. Our method is fast and significantly lowers memory requirements in comparison to other methods.",
    "author": "Daciuk, Jan; Mihov, Stoyan; Watson, Bruce W.; Watson, Richard E.",
    "doi": "10.1162/089120100561601",
    "id": "Daciuk-2000-MinimalFSA",
    "journal": "Computational Linguistics",
    "number": "1",
    "pages": "3-16",
    "publisher": "MIT Press",
    "title": "Incremental construction of minimal acyclic finite-state automata",
    "type": "article",
    "url": "https://doi.org/10.1162/089120100561601",
    "volume": "26",
    "year": 2000
  },
  {
    "abstract": "We investigate the properties of a metric between two distributions, the Earth Mover's Distance (EMD), for content-based image retrieval. The EMD is based on the minimal cost that must be paid to transform one distribution into the other, in a precise sense, and was first proposed for certain vision problems by Peleg, Werman, and Rom. For image retrieval, we combine this idea with a representation scheme for distributions that is based on vector quantization. This combination leads to an image comparison framework that often accounts for perceptual similarity better than other previously proposed methods. The EMD is based on a solution to the transportation problem from linear optimization, for which efficient algorithms are available, and also allows naturally for partial matching. It is more robust than histogram matching techniques, in that it can operate on variable-length representations of the distributions that avoid quantization and other binning problems typical of histograms. When used to compare distributions with the same overall mass, the EMD is a true metric. In this paper we focus on applications to color and texture, and we compare the retrieval performance of the EMD with that of other distances.",
    "author": "Rubner, Yossi; Tomasi, Carlo; Guibas, Leonidas J.",
    "doi": "10.1023/A:1026543900054",
    "id": "Rubner-2000-EMD",
    "journal": "International Journal of Computer Vision",
    "note": "Key reference for Earth Mover's Distance in image retrieval.",
    "number": "2",
    "pages": "99-121",
    "title": "The Earth Mover's Distance as a Metric for Image Retrieval",
    "type": "article",
    "url": "https://doi.org/10.1023/A:1026543900054",
    "volume": "40",
    "year": 2000
  },
  {
    "abstract": "We continue our study of the common features present in drug molecules by looking in detail at drug side chains. Using shape description methods, we divide a database of commercially available drugs into a list of common drug side chains. On the basis of the atom pair shape descriptor (taking into account atom type, hybridization, and bond order), there are 1246 different side chains among the 5090 compounds analyzed. The average number of side chains per molecule is 4, and the average number of heavy atoms per side chain is 2. If we ignore the carbonyl side chain, then there are approximately 15 000 occurrences of side chains. Of these 15 000 approximately 11 000 are from the “top 20” group of side chains. This suggests that the diversity that side chains provide to drug molecules is quite low. We discuss ways that this work could be used to provide guidance for molecular design efforts.",
    "author": "Bemis, Guy W.; Murcko, Mark A.",
    "doi": "10.1021/jm9903996",
    "id": "Bemis-1999-SideChains",
    "journal": "Journal of Medicinal Chemistry",
    "number": "25",
    "pages": "5095–5099",
    "title": "Properties of known drugs. 2. Side chains",
    "type": "article",
    "url": "https://doi.org/10.1021/jm9903996",
    "volume": "42",
    "year": 1999
  },
  {
    "abstract": "This book provides a comprehensive account of the theory of mass transportation problems, focusing on the Monge-Kantorovich mass transportation and the Kantorovich-Rubinstein mass transshipment problems, and explores their connections to various mathematical disciplines, including functional analysis, probability theory, and mathematical economics.",
    "author": "Rachev, Svetlozar T.; Rüschendorf, Ludger",
    "doi": "10.1007/b98893",
    "id": "Rachev-1998-MTP",
    "note": "Comprehensive theory on mass transportation, emphasizing the Monge-Kantorovich problem and its applications, with a corrected DOI.",
    "publisher": "Springer Verlag",
    "title": "Mass Transportation Problems: Theory, Volume I",
    "type": "book",
    "url": "https://doi.org/10.1007/b98893",
    "year": 1998
  },
  {
    "abstract": "In order to better understand the common features present in drug molecules, we use shape description methods to analyze a database of commercially available drugs and prepare a list of common drug shapes. A useful way of organizing this structural data is to group the atoms of each drug molecule into ring, linker, framework, and side chain atoms. On the basis of the two-dimensional molecular structures (without regard to atom type, hybridization, and bond order), there are 1179 different frameworks among the 5120 compounds analyzed. However, the shapes of half of the drugs in the database are described by the 32 most frequently occurring frameworks. This suggests that the diversity of shapes in the set of known drugs is extremely low. In our second method of analysis, in which atom type, hybridization, and bond order are considered, more diversity is seen; there are 2506 different frameworks among the 5120 compounds in the database, and the most frequently occurring 42 frameworks account for only one-fourth of the drugs. We discuss the possible interpretations of these findings and the way they may be used to guide future drug discovery research.",
    "author": "Bemis, Guy W.; Murcko, Mark A.",
    "doi": "10.1021/JM9602928",
    "id": "Bemis-1996-Frameworks",
    "journal": "Journal of Medicinal Chemistry",
    "number": "15",
    "pages": "2887–2893",
    "title": "The properties of known drugs. 1. Molecular frameworks",
    "type": "article",
    "url": "https://doi.org/10.1021/JM9602928",
    "volume": "39",
    "year": 1996
  },
  {
    "abstract": "Gage introduces a simple yet effective data compression method called Byte Pair Encoding (BPE). The algorithm iteratively replaces the most frequent pair of adjacent bytes with a byte not occurring in the data. This process continues until no further compression is possible. The result is a variable-length encoding suited for lossless data compression.",
    "author": "Gage, Philip",
    "doi": "10.5555/177910.177914",
    "id": "Gage-1994-BPE",
    "journal": "The C Users Journal",
    "keywords": "byte pair encoding, BPE, data compression, lossless compression, Gage",
    "note": "Introduced Byte Pair Encoding (BPE) as a compression method based on iterative merging of frequent byte pairs.",
    "number": "2",
    "pages": "23-38",
    "publisher": "R&D Publications",
    "title": "A new algorithm for data compression",
    "type": "article",
    "url": "https://dl.acm.org/doi/10.5555/177910.177914",
    "volume": "12",
    "year": 1994
  },
  {
    "abstract": "This report presents a taxonomy of finite automata minimization algorithms. Brzozowski's elegant minimization algorithm differs from all other known minimization algorithms and is derived separately. All of the remaining algorithms depend upon computing an equivalence relation on states. The equivalence relation, the partition it induces, and its complement are defined, and some useful properties are derived. It is shown that the equivalence relation is the greatest fixed point of an equation, providing a useful characterization of the required computation. An upper bound on the number of approximation steps required to compute the fixed point is derived. Algorithms computing the equivalence relation (or the partition, or its complement) are derived systematically within the same framework. The algorithms include Hopcroft's, several algorithms from textbooks (including those by Hopcroft and Ullman, Wood, and Aho, Sethi, and Ullman), and several new algorithms or variants of existing algorithms.",
    "author": "Watson, Bruce W.",
    "id": "Watson-1993-TaxonomyFAMin",
    "journal": "Computing Science Note 93/44",
    "pages": "1-23",
    "publisher": "Eindhoven University of Technology",
    "title": "A taxonomy of finite automata minimization algorithms",
    "type": "article",
    "url": "https://research.tue.nl/files/1661456/9313451.pdf",
    "year": 1993
  },
  {
    "abstract": "The explosive accumulation of protein sequences in the wake of large-scale sequencing projects is in stark contrast to the much slower experimental determination of protein structures. Improved methods of structure prediction from the gene sequence alone are therefore needed. Here, we report a substantial increase in both the accuracy and quality of secondary-structure predictions, using a neural-network algorithm. The main improvements come from the use of multiple sequence alignments (better overall accuracy), from \"balanced training\" (better prediction of β-strands), and from \"structure context training\" (better prediction of helix and strand lengths). This method, cross-validated on seven different test sets purged of sequence similarity to learning sets, achieves a three-state prediction accuracy of 69.7%, significantly better than previous methods. In addition, the predicted structures have a more realistic distribution of helix and strand segments. The predictions may be suitable for use in practice as a first estimate of the structural type of newly sequenced proteins.",
    "author": "Rost, Burkhard; Sander, Chris",
    "id": "Rost-1993-SecondaryStructure",
    "journal": "Proceedings of the National Academy of Sciences",
    "keywords": "secondary structure prediction, sequence profiles, neural networks",
    "pages": "7558-7562",
    "title": "Improved prediction of protein secondary structure by use of sequence profiles and neural networks",
    "type": "article",
    "url": "https://www.jstor.org/stable/2362759",
    "volume": "90",
    "year": 1993
  },
  {
    "author": "Weininger, David",
    "doi": "10.1021/ci00067a005",
    "id": "Weininger-1990-SMILES3",
    "journal": "J. Chem. Inf. Comput. Sci.",
    "pages": "237–243",
    "title": "SMILES, 3. DEPICT. Graphical depiction of chemical structures",
    "type": "article",
    "url": "https://doi.org/10.1021/ci00067a005",
    "volume": "30",
    "year": 1990
  },
  {
    "author": "Weininger, David; Weininger, Arthur; Weininger, Joseph L.",
    "doi": "10.1021/ci00062a008",
    "id": "Weininger-1989-SMILES2",
    "journal": "Journal of Chemical Information and Computer Sciences",
    "number": "2",
    "pages": "97-101",
    "title": "SMILES. 2. Algorithm for generation of unique SMILES notation",
    "type": "article",
    "url": "https://doi.org/10.1021/ci00062a008",
    "volume": "29",
    "year": 1989
  },
  {
    "abstract": "A method is presented for protein secondary structure prediction based on a neural network. A training phase was used to teach the network to recognize the relation between secondary structure and amino acid sequences on a sample set of 48 proteins of known structure. On a separate test set of 14 proteins of known structure, the method achieved a maximum overall predictive accuracy of 63% for three states: helix, sheet, and coil. A numerical measure of helix and sheet tendency for each residue was obtained from the calculations. When predictions were filtered to include only the strongest 31% of predictions, the predictive accuracy rose to 79%.",
    "author": "Holley, Lynne H.; Karplus, Martin",
    "doi": "10.1073/pnas.86.1.152",
    "id": "Holley-1989-SecondaryStructure",
    "journal": "Proceedings of the National Academy of Sciences",
    "keywords": "secondary structure prediction, neural networks, helix, sheet, coil",
    "number": "1",
    "pages": "152-156",
    "title": "Protein secondary structure prediction with a neural network",
    "type": "article",
    "volume": "86",
    "year": 1989
  },
  {
    "author": "Weininger, David",
    "doi": "10.1021/ci00057a005",
    "id": "Weininger-1988-SMILES",
    "journal": "Journal of Chemical Information and Computer Sciences",
    "number": "1",
    "pages": "31-36",
    "title": "SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules",
    "type": "article",
    "url": "https://doi.org/10.1021/ci00057a005",
    "volume": "28",
    "year": 1988
  },
  {
    "abstract": "We present a new method for predicting the secondary structure of globular proteins based on non-linear neural network models. Network models learn from existing protein structures how to predict the secondary structure of local sequences of amino acids. The average success rate of our method on a testing set of proteins non-homologous with the corresponding training set was 64.3% on three types of secondary structure (α-helix, β-sheet, and coil), with correlation coefficients of C_α = 0.41, C_β = 0.31 and C_coil = 0.41. These quality indices are all higher than those of previous methods. The prediction accuracy for the first 25 residues of the N-terminal sequence was significantly better. We conclude from computational experiments on real and artificial structures that no method based solely on local information in the protein sequence is likely to produce significantly better results for non-homologous proteins. The performance of our method on homologous proteins is much better than for non-homologous proteins, but is not as good as simply assuming that homologous sequences have identical structures.",
    "author": "Qian, Ning; Sejnowski, Terrence J.",
    "doi": "10.1016/0022-2836(88)90564-5",
    "id": "Qian-1988-SecondaryStructure",
    "journal": "Journal of Molecular Biology",
    "keywords": "secondary structure prediction, neural networks, globular proteins, α-helix, β-sheet, coil",
    "pages": "865-884",
    "title": "Predicting the secondary structure of globular proteins using neural network models",
    "type": "article",
    "volume": "202",
    "year": 1988
  },
  {
    "abstract": "Assume that a random sample is drawn from a population with an unknown number of classes. This work proposes a nonparametric method to estimate the number of classes when most of the information is concentrated on the low order occupancy numbers. The percentile method (Efron, 1981, 1982) is applied to construct confidence intervals based on bootstrap distributions. Using real data sets, we also compare the proposed point and interval estimates with previously published results.",
    "author": "Chao, Anne",
    "doi": "10.2307/4615964",
    "id": "Chao-1984-Nonparametric",
    "journal": "Scandinavian Journal of Statistics",
    "keywords": "species richness, Chao1, biodiversity, nonparametric estimation",
    "number": "4",
    "pages": "265-270",
    "title": "Nonparametric estimation of the number of classes in a population",
    "type": "article",
    "url": "https://www.jstor.org/stable/4615964",
    "volume": "11",
    "year": 1984
  },
  {
    "author": "Buckingham, A. D.; Schaefer, T.; Schneider, W. G.",
    "doi": "10.1063/1.1730879",
    "id": "Buckingham-1960-SolventNMR",
    "journal": "The Journal of Chemical Physics",
    "number": "4",
    "pages": "1227-1233",
    "title": "Solvent effects in nuclear magnetic resonance spectra",
    "type": "article",
    "url": "https://doi.org/10.1063/1.1730879",
    "volume": "32",
    "year": 1960
  },
  {
    "author": "Firth, John Rupert",
    "booktitle": "Studies in Linguistic Analysis",
    "id": "Firth-1957-SynopsisLinguisticTheory",
    "note": "This work summarizes Firth's linguistic theories, emphasizing the importance of context and collocation in understanding meaning. It laid the groundwork for contextual and prosodic analysis and was later reprinted in Selected Papers of J. R. Firth 1952-59 (1968).",
    "pages": "1-32",
    "publisher": "Philological Society, Oxford",
    "title": "A Synopsis of Linguistic Theory, 1930-1955",
    "type": "incollection",
    "url": "https://languagelog.ldc.upenn.edu/myl/Firth1957.pdf",
    "year": 1957
  },
  {
    "abstract": "Zellig S. Harris introduces the concept of distributional structure, emphasizing that the meaning of linguistic elements can be inferred from their patterns of distribution within a language. This foundational work laid the groundwork for modern distributional semantics.",
    "author": "Harris, Zellig S.",
    "doi": "10.1080/00437956.1954.11659520",
    "id": "Harris-1954-DistributionalStructure",
    "journal": "Word",
    "keywords": "distributional structure, linguistics, semantics, language analysis",
    "number": "2-3",
    "pages": "146-162",
    "title": "Distributional Structure",
    "type": "article",
    "url": "https://www.tandfonline.com/doi/pdf/10.1080/00437956.1954.11659520",
    "volume": "10",
    "year": 1954
  }
]