@inproceedings{brockRanges2024,
 author = {Brock, Benjamin and Cohn, Robert and Bakshi, Suyash and Karna, Tuomas and Kim, Jeongnim and Nowak, Mateusz and {\'S}lusarczyk, {\L}ukasz and Stefanski, Kacper and Mattson, Timothy G.},
 title = {Distributed Ranges: A Model for Distributed Data Structures, Algorithms, and Views},
 year = {2024},
 isbn = {9798400706103},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 doi = {10.1145/3650200.3656632},
 abstract = {Data structures and algorithms are essential building blocks for programs, and distributed data structures, which automatically partition data across multiple memory locales, are essential to writing high-level parallel programs. While many projects have designed and implemented C++ distributed data structures and algorithms, there has not been widespread adoption of an interoperable model allowing algorithms and data structures from different libraries to work together. This paper introduces distributed ranges, which is a model for building generic data structures, views, and algorithms. A distributed range extends a C++ range, which is an iterable sequence of values, with a concept of segmentation, thus exposing how the distributed range is partitioned over multiple memory locales. Distributed data structures provide this distributed range interface, which allows them to be used with a collection of generic algorithms implemented using the distributed range interface. The modular nature of the model allows for the straightforward implementation of distributed views, which are lightweight objects that provide a lazily evaluated view of another range. Views can be composed together recursively and combined with algorithms to implement computational kernels using efficient, flexible, and high-level standard C++ primitives. We evaluate the distributed ranges model by implementing a set of standard concepts and views as well as two execution runtimes, a multi-node, MPI-based runtime and a single-process, multi-GPU runtime. We demonstrate that high-level algorithms implemented using generic, high-level distributed ranges can achieve performance competitive with highly-tuned, expert-written code.},
 booktitle = {Proceedings of the 38th {ACM} International Conference on Supercomputing},
 pages = {236--246},
 numpages = {11},
 location = {Kyoto, Japan},
 series = {ICS '24}
}

@inproceedings{brockGPU2024,
 author = {Brock, Benjamin and Bulu{\c{c}}, Ayd{\i}n and Yelick, Katherine},
 title = {{RDMA}-Based Algorithms for Sparse Matrix Multiplication on {GPUs}},
 year = {2024},
 isbn = {9798400706103},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 doi = {10.1145/3650200.3656623},
 abstract = {Sparse matrix multiplication is an important kernel for large-scale graph processing and other data-intensive applications. In this paper, we implement various asynchronous, RDMA-based sparse times dense (SpMM) and sparse times sparse (SpGEMM) algorithms, evaluating their performance running in a distributed memory setting on GPUs. Our RDMA-based implementations use the NVSHMEM communication library for direct, asynchronous one-sided communication between GPUs. We compare our asynchronous implementations to state-of-the-art bulk synchronous GPU libraries as well as a CUDA-Aware MPI implementation of the SUMMA algorithm. We find that asynchronous RDMA-based implementations are able to offer favorable performance compared to bulk synchronous implementations, while also allowing for the straightforward implementation of novel work stealing algorithms.},
 booktitle = {Proceedings of the 38th {ACM} International Conference on Supercomputing},
 pages = {225--235},
 numpages = {11},
 location = {Kyoto, Japan},
 series = {ICS '24}
}

@phdthesis{brockthesis22,
  title    = {{RDMA}-Based Distributed Data Structures for Large-Scale Parallel Systems},
  school   = {University of California, Berkeley},
  author   = {Brock, Benjamin},
  year     = {2022},
  internal-note = {other attributes omitted},
}

@inproceedings{Brock:2019:BCD:3337821.3337912,
 author = {Brock, Benjamin and Bulu{\c{c}}, Ayd{\i}n and Yelick, Katherine},
 title = {{BCL}: A Cross-Platform Distributed Data Structures Library},
 booktitle = {Proceedings of the 48th International Conference on Parallel Processing},
 series = {ICPP 2019},
 year = {2019},
 isbn = {978-1-4503-6295-5},
 location = {Kyoto, Japan},
 pages = {102:1--102:10},
 articleno = {102},
 numpages = {10},
 doi = {10.1145/3337821.3337912},
 acmid = {3337912},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Distributed Data Structures, Parallel Programming Libraries, RDMA},
}

@misc{brock2019rdma,
    title = {{RDMA} vs. {RPC} for Implementing Distributed Data Structures},
    author = {Brock, Benjamin and Chen, Yuxin and Yan, Jiakun and Owens, John and Bulu{\c{c}}, Ayd{\i}n and Yelick, Katherine},
    year = {2019},
    eprint = {1910.02158},
    archiveprefix = {arXiv},
    primaryclass = {cs.DC}
}

@inproceedings{driscoll2018indigo,
  author       = {Driscoll, Michael and Brock, Benjamin and Ong, Frank and Tamir, Jonathan and Liu, Hsiou-Yuan and Lustig, Michael and Fox, Armando and Yelick, Katherine},
  title        = {Indigo: A Domain-Specific Language for Fast, Portable Image Reconstruction},
  booktitle    = {2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year         = {2018},
  pages        = {495--504},
  organization = {IEEE}
}

@article{brock2015explicit,
  title={Explicit Integration with {GPU} Acceleration for Large Kinetic Networks},
  author={Brock, Benjamin and Belt, Andrew and Billings, Jay Jay and Guidry, Mike},
  journal={Journal of Computational Physics},
  volume={302},
  pages={591--602},
  year={2015},
  publisher={Elsevier}
}

@inproceedings{haidar2016performance,
  title={Performance Analysis and Acceleration of Explicit Integration for Large Kinetic Networks Using Batched {GPU} Computations},
  author={Haidar, Azzam and Brock, Benjamin and Tomov, Stanimire and Guidry, Michael and Billings, Jay Jay and Shyles, Daniel and Dongarra, Jack},
  booktitle={2016 IEEE High Performance Extreme Computing Conference (HPEC)},
  pages={1--7},
  year={2016},
  organization={IEEE}
}

@inproceedings{huang2019centrifuge,
  title={Centrifuge: Evaluating Full-System {HLS}-Generated Heterogenous-Accelerator {SoCs} Using {FPGA}-Acceleration},
  author={Huang, Qijing and Yarp, Christopher and Karandikar, Sagar and Pemberton, Nathan and Brock, Benjamin and Ma, Liang and Dai, Guohao and Quitt, Robert and Asanovic, Krste and Wawrzynek, John},
  booktitle={2019 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  pages={1--8},
  year={2019},
  organization={IEEE},
  internal-note={spelling "Heterogenous" retained from source entry -- verify against published title},
}