% Conference paper in Advances in Neural Information Processing Systems,
% volume 34 (2021). The citation key is kept exactly as exported so that
% existing \cite commands continue to resolve.
% Names use the unambiguous "Last, First" form; the compound surname
% "Wortman Vaughan" is written before the comma so it is not split.
@inproceedings{NEURIPS2021_51f15efd,
  author    = {Jaszczur, Sebastian and Chowdhery, Aakanksha and Mohiuddin, Afroz and Kaiser, {\L}ukasz and Gajewski, Wojciech and Michalewski, Henryk and Kanerva, Jonni},
  title     = {Sparse is Enough in Scaling {Transformers}},
  editor    = {Ranzato, M. and Beygelzimer, A. and Dauphin, Y. and Liang, P. S. and Wortman Vaughan, J.},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {34},
  pages     = {9895--9907},
  publisher = {Curran Associates, Inc.},
  year      = {2021},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2021/file/51f15efdd170e6043fa02a74882f0470-Paper.pdf},
}