@article{hladky-2022-QS,
  title = "QuadStream: A Quad-Based Scene Streaming Architecture for Novel Viewpoint Reconstruction",
  author = "Jozef Hladky and Michael Stengel and Nicholas Vining and Bernhard Kerbl and Hans-Peter Seidel and Markus Steinberger",
  year = "2022",
  abstract = "Cloud rendering is attractive when targeting thin client devices such as phones or VR/AR headsets, or any situation where a high-end GPU is not available due to thermal or power constraints. However, it introduces the challenge of streaming rendered data over a network in a manner that is robust to latency and potential dropouts. Current approaches range from streaming transmitted video and correcting it on the client---which fails in the presence of disocclusion events---to solutions where the server sends geometry and all rendering is performed on the client. To balance the competing goals of disocclusion robustness and minimal client workload, we introduce QuadStream, a new streaming technique that reduces motion-to-photon latency by allowing clients to render novel views on the fly and is robust against disocclusions. Our key idea is to transmit an approximate geometric scene representation to the client which is independent of the source geometry and can render both the current view frame and nearby adjacent views. Motivated by traditional macroblock approaches to video codec design, we decompose the scene seen from positions in a view cell into a series of view-aligned quads from multiple views, or QuadProxies. By operating on a rasterized G-Buffer, our approach is independent of the representation used for the scene itself. Our technical contributions are an efficient parallel quad generation, merging, and packing strategy for proxy views that cover potential client movement in a scene; a packing and encoding strategy allowing masked quads with depth information to be transmitted as a frame-coherent stream; and an efficient rendering approach that takes advantage of modern hardware capabilities to turn our QuadStream representation into complete novel views on thin clients. According to our experiments, our approach achieves superior quality compared both to streaming methods that rely on simple video data and to geometry-based streaming.",
  month = dec,
  journal = "ACM Transactions on Graphics",
  volume = "41",
  number = "6",
  issn = "1557-7368",
  publisher = "Association for Computing Machinery",
  keywords = "streaming, real-time rendering, virtual reality",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/hladky-2022-QS/",
}

@article{SCHUETZ-2022-PCC,
  title = "Software Rasterization of 2 Billion Points in Real Time",
  author = "Markus Sch\"{u}tz and Bernhard Kerbl and Michael Wimmer",
  year = "2022",
  abstract = "We propose a software rasterization pipeline for point clouds that is capable of brute-force rendering up to two billion points in real time (60 fps). Improvements over the state of the art are achieved by batching points in a way that allows a number of batch-level optimizations to be computed before rasterizing the points within the same rendering pass. These optimizations include frustum culling, level-of-detail rendering, and choosing the appropriate coordinate precision for a given batch of points directly within a compute workgroup. Adaptive coordinate precision, in conjunction with visibility buffers, reduces the number of loaded bytes for the majority of points down to 4, thus making our approach several times faster than the bandwidth-limited state of the art.
  Furthermore, support for LOD rendering makes our software-rasterization approach suitable for rendering arbitrarily large point clouds and for meeting the increased performance demands of virtual reality rendering.",
  month = jul,
  journal = "Proceedings of the ACM on Computer Graphics and Interactive Techniques",
  volume = "5",
  number = "3",
  issn = "2577-6193",
  doi = "10.1145/3543863",
  pages = "1--17",
  keywords = "point-based rendering",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/SCHUETZ-2022-PCC/",
}

@article{cardoso-2022-rtpercept,
  title = "Training and Predicting Visual Error for Real-Time Applications",
  author = "Joao Afonso Cardoso and Bernhard Kerbl and Lei Yang and Yury Uralsky and Michael Wimmer",
  year = "2022",
  abstract = "Visual error metrics play a fundamental role in the quantification of perceived image similarity. Most recently, use cases for them in real-time applications have emerged, such as content-adaptive shading and shading reuse to increase performance and improve efficiency. A wide range of different metrics has been established, with the most sophisticated being capable of capturing the perceptual characteristics of the human visual system. However, their complexity, computational expense, and reliance on reference images to compare against prevent their generalized use in real time, restricting such applications to using only the simplest available metrics. In this work, we explore the abilities of convolutional neural networks to predict a variety of visual metrics without requiring either reference or rendered images. Specifically, we train and deploy a neural network to estimate the visual error resulting from reusing shading or using reduced shading rates. The resulting models account for 70%--90% of the variance while achieving up to an order of magnitude faster computation times. Our solution combines image-space information that is readily available in most state-of-the-art deferred shading pipelines with reprojection from previous frames to enable an adequate estimate of visual errors, even in previously unseen regions. We describe a suitable convolutional network architecture and considerations for data preparation for training. We demonstrate the capability of our network to predict complex error metrics at interactive rates in a real-time application that implements content-adaptive shading in a deferred pipeline. Depending on the portion of unseen image regions, our approach can achieve up to 2x performance compared to state-of-the-art methods.",
  month = may,
  journal = "Proceedings of the ACM on Computer Graphics and Interactive Techniques",
  volume = "5",
  number = "1",
  issn = "2577-6193",
  doi = "10.1145/3522625",
  publisher = "Association for Computing Machinery",
  pages = "1--17",
  keywords = "perceptual error, variable rate shading, real-time",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/cardoso-2022-rtpercept/",
}

@inproceedings{celarek-2022-gmcn,
  title = "Gaussian Mixture Convolution Networks",
  author = "Adam Celarek and Pedro Hermosilla-Casajus and Bernhard Kerbl and Timo Ropinski and Michael Wimmer",
  year = "2022",
  abstract = "This paper proposes a novel method for deep learning based on the analytical convolution of multidimensional Gaussian mixtures. In contrast to tensors, these do not suffer from the curse of dimensionality and allow for a compact representation, as data is only stored where details exist.
  Convolution kernels and data are Gaussian mixtures with unconstrained weights, positions, and covariance matrices. Similar to discrete convolutional networks, each convolution step produces several feature channels, represented by independent Gaussian mixtures. Since traditional transfer functions like ReLUs do not produce Gaussian mixtures, we propose using a fitting of these functions instead. This fitting step also acts as a pooling layer if the number of Gaussian components is reduced appropriately. We demonstrate that networks based on this architecture reach competitive accuracy on Gaussian mixtures fitted to the MNIST and ModelNet data sets.",
  month = apr,
  publisher = "OpenReview.org",
  event = "ICLR 2022",
  booktitle = "The Tenth International Conference on Learning Representations (ICLR 2022)",
  pages = "1--23",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/celarek-2022-gmcn/",
}

@inproceedings{unterguggenberger-2022-vulkan,
  title = "The Road to Vulkan: Teaching Modern Low-Level APIs in Introductory Graphics Courses",
  author = "Johannes Unterguggenberger and Bernhard Kerbl and Michael Wimmer",
  year = "2022",
  abstract = "For over two decades, the OpenGL API provided users with the means for implementing versatile, feature-rich, and portable real-time graphics applications. Consequently, it has been widely adopted by practitioners and educators alike and is deeply ingrained in many curricula that teach real-time graphics for higher education. Over the years, the architecture of graphics processing units (GPUs) incrementally diverged from OpenGL's conceptual design. The more recently introduced Vulkan API provides a more modern, fine-grained approach for interfacing with the GPU. Various properties of this API and overall trends suggest that Vulkan could soon replace OpenGL in many areas. Hence, it stands to reason that educators who have their students' best interests at heart should provide them with corresponding lecture material. However, Vulkan is notoriously verbose and rather challenging for first-time users, thus transitioning to this new API bears a considerable risk of failing to achieve expected teaching goals. In this paper, we document our experiences after teaching Vulkan in an introductory graphics course side-by-side with conventional OpenGL. A final survey enables us to draw conclusions about perceived workload, difficulty, and students' acceptance of either approach and identify suitable conditions and recommendations for teaching Vulkan to undergraduate students.",
  month = apr,
  isbn = "978-3-03868-170-0",
  publisher = "The Eurographics Association",
  location = "Reims",
  issn = "1017-4656",
  event = "Eurographics 2022",
  doi = "10.2312/eged.20221043",
  booktitle = "Eurographics 2022 - Education Papers",
  pages = "31--39",
  keywords = "vulkan, gpu, opengl",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/unterguggenberger-2022-vulkan/",
}

@inproceedings{kerbl-2022-trienc,
  title = "An Improved Triangle Encoding Scheme for Cached Tessellation",
  author = "Bernhard Kerbl and Linus Horvath and Daniel Cornel and Michael Wimmer",
  year = "2022",
  abstract = "With the recent advances in real-time rendering that were achieved by embracing software rasterization, the interest in alternative solutions for other fixed-function pipeline stages rises. In this paper, we revisit a recently presented software approach for cached tessellation, which compactly encodes and stores triangles in GPU memory.
  While the proposed technique is both efficient and versatile, we show that the original encoding is suboptimal and provide an alternative scheme that acts as a drop-in replacement. As shown in our evaluation, the proposed modifications can yield performance gains of 40\% and more.",
  month = apr,
  isbn = "978-3-03868-169-4",
  location = "Reims",
  issn = "1017-4656",
  event = "Eurographics 2022",
  editor = "Pelechano, Nuria and Vanderhaeghe, David",
  doi = "10.2312/egs.20221031",
  booktitle = "Eurographics 2022 - Short Papers",
  pages = "1--4",
  keywords = "gpu, real-time, tessellation",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/kerbl-2022-trienc/",
}

@inproceedings{kerbl-2022-cuda,
  title = "CUDA and Applications to Task-based Programming",
  author = "Bernhard Kerbl and Michael Kenzel and Martin Winter and Markus Steinberger",
  year = "2022",
  abstract = "To provide a profound understanding of how CUDA applications can achieve peak performance, the first two parts of this tutorial outline the modern CUDA architecture. Following a basic introduction, we expose how language features are linked to---and constrained by---the underlying physical hardware components. Furthermore, we describe common applications for massively parallel programming, offer a detailed breakdown of potential issues, and list ways to mitigate performance impacts. An exemplary analysis of PTX and SASS snippets illustrates how code patterns in CUDA are mapped to actual hardware instructions. In parts 3 and 4, we focus on novel features that were enabled by the arrival of CUDA 10+ toolkits and the Volta+ architectures, such as ITS, tensor cores, and the graph API. In addition to basic use case demonstrations, we outline our own experiences with these capabilities and their potential performance benefits. We also discuss how long-standing best practices are affected by these changes and describe common caveats for dealing with legacy code on recent GPU models. We show how these considerations can be implemented in practice by presenting state-of-the-art research into task-based GPU scheduling, and how the dynamic adjustment of thread roles and group configurations can significantly increase performance.",
  month = apr,
  booktitle = "Eurographics 2022 - Tutorials",
  editor = "Stefanie Hahmann and Gustavo Patow",
  location = "Reims",
  publisher = "The Eurographics Association",
  keywords = "Parallel Programming, GPU",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2022/kerbl-2022-cuda/",
}

@inproceedings{murturi_PGG,
  title = "On Provisioning Procedural Geometry Workloads on Edge Architectures",
  author = "Ilir Murturi and Chao Jia and Bernhard Kerbl and Michael Wimmer and Schahram Dustdar and Christos Tsigkanos",
  year = "2021",
  abstract = "Contemporary applications such as those within Augmented or Virtual Reality (AR/VR) pose challenges for the software architectures supporting them, which have to adhere to stringent latency, data transmission, and performance requirements. This manifests in the processing of 3D models, whose 3D contents are increasingly generated procedurally rather than explicitly, resulting in computational workloads (so-called Procedural Geometry Workloads) with particular characteristics and resource requirements. Traditionally, executing such workloads takes place in resource-rich environments such as the cloud.
  However, the massive amount of data transfer, heterogeneous devices, and networks involved affect latency, which in turn causes low-quality visualization in user-facing applications (e.g., AR/VR). To overcome such challenges, processing elements available close to end users can be leveraged to generate 3D models instead, and as such the edge emerges as a central architectural entity. This paper describes such procedural geometry workloads, their particular characteristics, and the challenges of executing them on heterogeneous devices. Furthermore, we propose an architecture capable of provisioning procedural geometry workloads in edge scenarios.",
  month = oct,
  isbn = "978-989-758-536-4",
  publisher = "SciTePress",
  organization = "INSTICC",
  event = "17th International Conference on Web Information Systems and Technologies - WEBIST",
  editor = "Francisco Dom\'{i}nguez Mayo and Massimo Marchiori and Joaquim Filipe",
  doi = "10.5220/0010687800003058",
  booktitle = "Proceedings of the 17th International Conference on Web Information Systems and Technologies - WEBIST",
  pages = "354--359",
  keywords = "distributed systems, procedural geometry, rendering",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/murturi_PGG/",
}

@inproceedings{roth_vdi,
  title = "View-Dependent Impostors for Architectural Shape Grammars",
  author = "Chao Jia and Moritz Roth and Bernhard Kerbl and Michael Wimmer",
  year = "2021",
  abstract = "Procedural generation has become a key component in satisfying a growing demand for ever-larger, highly detailed geometry in realistic, open-world games and simulations. In this paper, we present our work towards a new level-of-detail mechanism for procedural geometry shape grammars. Our approach automatically identifies and adds suitable surrogate rules to a shape grammar's derivation tree. Opportunities for surrogates are detected in a dedicated pre-processing stage. Where suitable, textured impostors are then used for rendering based on the current viewpoint at runtime. Our proposed methods generate simplified geometry with visual quality superior to the state of the art and roughly the same rendering performance.",
  month = oct,
  isbn = "978-3-03868-162-5",
  publisher = "The Eurographics Association",
  location = "online",
  event = "Pacific Graphics 2021",
  editor = "Lee, Sung-Hee and Zollmann, Stefanie and Okabe, Makoto and W\"{u}nsche, Burkhard",
  doi = "10.2312/pg.20211390",
  booktitle = "Pacific Graphics Short Papers, Posters, and Work-in-Progress Papers",
  pages = "63--64",
  keywords = "procedural geometry, real-time, GPU",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/roth_vdi/",
}

@inproceedings{stappen_SteFAS,
  title = "Temporally Stable Content-Adaptive and Spatio-Temporal Shading Rate Assignment for Real-Time Applications",
  author = "Stefan Stappen and Johannes Unterguggenberger and Bernhard Kerbl and Michael Wimmer",
  year = "2021",
  abstract = "We propose two novel methods to improve the efficiency and quality of real-time rendering applications: Texel differential-based content-adaptive shading (TDCAS) and spatio-temporally filtered adaptive shading (STeFAS). Utilizing Variable Rate Shading (VRS)---a hardware feature introduced with NVIDIA's Turing micro-architecture---and properties derived during rendering or Temporal Anti-Aliasing (TAA), our techniques adapt the resolution to improve the performance and quality of real-time applications.
  VRS enables different shading resolutions for different regions of the screen during a single render pass. In contrast to other techniques, TDCAS and STeFAS incur very little overhead for computing the shading rate. STeFAS enables up to 4x higher rendering resolutions at similar frame rates, or a 4x performance increase at the same resolution.",
  month = oct,
  isbn = "978-3-03868-162-5",
  publisher = "The Eurographics Association",
  location = "online",
  event = "Pacific Graphics 2021",
  editor = "Lee, Sung-Hee and Zollmann, Stefanie and Okabe, Makoto and W\"{u}nsche, Burkhard",
  doi = "10.2312/pg.20211391",
  booktitle = "Pacific Graphics Short Papers, Posters, and Work-in-Progress Papers",
  pages = "65--66",
  keywords = "variable rate shading, temporal antialiasing",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/stappen_SteFAS/",
}

@article{unterguggenberger-2021-msh,
  title = "Conservative Meshlet Bounds for Robust Culling of Skinned Meshes",
  author = "Johannes Unterguggenberger and Bernhard Kerbl and Jakob Pernsteiner and Michael Wimmer",
  year = "2021",
  abstract = "Following recent advances in GPU hardware development and newly introduced rendering pipeline extensions, the segmentation of input geometry into small geometry clusters---so-called meshlets---has emerged as an important practice for efficient rendering of complex 3D models. Meshlets can be processed efficiently using mesh shaders on modern graphics processing units, in order to achieve streamlined geometry processing in just two tightly coupled shader stages that allow for dynamic workload manipulation in-between. The additional granularity layer between entire models and individual triangles enables new opportunities for fine-grained visibility culling methods. However, in contrast to static models, view frustum and backface culling on a per-meshlet basis for skinned, animated models are difficult to achieve while respecting the conservative spatio-temporal bounds that are required for robust rendering results. In this paper, we describe a solution for computing and exploiting relevant conservative bounds for culling meshlets of models that are animated using linear blend skinning. By enabling visibility culling for animated meshlets, our approach can help to improve rendering performance and alleviate bottlenecks in the notoriously performance- and memory-intensive skeletal animation pipelines of modern real-time graphics applications.",
  month = oct,
  journal = "Computer Graphics Forum",
  volume = "40",
  number = "7",
  issn = "1467-8659",
  doi = "10.1111/cgf.14401",
  publisher = "The Eurographics Association",
  pages = "57--69",
  keywords = "real-time rendering, meshlet, mesh shader, task shader, view frustum culling, backface culling, Vulkan, vertex skinning, animation, conservative bounds, bounding boxes, Rodrigues' rotation formula, spatio-temporal bounds",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/unterguggenberger-2021-msh/",
}

@talk{kerbl_2021_hdg,
  title = "Providing Highly Detailed Geometry for Cloud and Edge Real-Time Rendering",
  author = "Bernhard Kerbl",
  year = "2021",
  abstract = "Mesh shading was recently introduced as a topical GPU feature in the NVIDIA Turing and AMD RDNA2 GPU architectures, offering an alternative pathway for executing the transformation, generation, and augmentation of geometry for hardware rasterization.
  Future trends in game development will rely on mesh shading and “meshlets”, using highly detailed meshes with deep level-of-detail hierarchies. Particularly powerful applications of meshlets include arbitrary culling and subdivision methods. Furthermore, advanced pre-computed data, such as visibility and lighting information, can be stored on a per-meshlet basis, thus promoting the compression of attributes through quantization and the acceleration of computations via hierarchical processing. Although meshlets can be composed of arbitrary assemblages of primitives, their benefits are highest when meshlet formation already takes the use case into account. Individual formation procedures can be defined in order to achieve specific goals. As an example, we may generate meshlets that are optimized for global illumination techniques by minimizing their curvature and variance in material coefficients. Incoming light can then be ray-traced and cached per meshlet, along with view-dependent variance encoded in a discretized data structure. More uniform meshlets thus require less data transferred for accurately approximating their global illumination, reducing the consumption of critical memory bandwidth. We may also partition entire scenes into meshlets that foster fast visibility culling for large groups of primitives, without transforming even a single vertex. In fact, meshlet formation policies can leverage arbitrary attributes, such as the distribution of UV coordinates, ambient occlusion, or mesh topology, in order to optimize them according to desired runtime criteria. Cloud gaming offers a unique opportunity for leveraging this technology at a larger scale: dedicated data stores and servers can maintain multiple copies of complex triangle meshes, each partitioned by a particular meshlet formation policy. A live monitor can react to a specific bottleneck by dynamically switching meshlets to best accommodate the current GPU resource requirements. In this talk, we will present the various possibilities for real-time rendering to benefit from mesh shading by means of optimized meshlet formation procedures.",
  month = jul,
  event = "InnovWave 2021",
  location = "online",
  keywords = "cloud, real-time, rendering",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/kerbl_2021_hdg/",
}

@article{SCHUETZ-2021-PCC,
  title = "Rendering Point Clouds with Compute Shaders and Vertex Order Optimization",
  author = "Markus Sch\"{u}tz and Bernhard Kerbl and Michael Wimmer",
  year = "2021",
  abstract = "While commodity GPUs provide a continuously growing range of features and sophisticated methods for accelerating compute jobs, many state-of-the-art solutions for point cloud rendering still rely on the provided point primitives (GL_POINTS, POINTLIST, ...) of graphics APIs for image synthesis. In this paper, we present several compute-based point cloud rendering approaches that outperform the hardware pipeline by up to an order of magnitude and achieve significantly better frame times than previous compute-based methods. Beyond basic closest-point rendering, we also introduce a fast, high-quality variant to reduce aliasing. We present and evaluate several variants of our proposed methods with different flavors of optimization, in order to ensure their applicability and achieve optimal performance on a range of platforms and architectures with varying support for novel GPU hardware features.
  During our experiments, the observed peak performance was reached when rendering 796 million points (12.7 GB) at rates of 62 to 64 frames per second (50 billion points per second, 802 GB/s) on an RTX 3090 without the use of level-of-detail structures. We further introduce an optimized vertex order for point clouds to boost the efficiency of GL_POINTS by a factor of 5 in cases where hardware rendering is compulsory. We compare different orderings and show that Morton-sorted buffers are faster for some viewpoints, while shuffled vertex buffers are faster in others. In contrast, combining both approaches by first sorting according to Morton code and then shuffling the resulting sequence in batches of 128 points leads to a vertex buffer layout with high rendering performance and low sensitivity to viewpoint changes.",
  month = jul,
  journal = "Computer Graphics Forum",
  volume = "40",
  number = "4",
  issn = "1467-8659",
  doi = "10.1111/cgf.14345",
  publisher = "The Eurographics Association",
  pages = "115--126",
  keywords = "point-based rendering, compute shader, real-time rendering",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/SCHUETZ-2021-PCC/",
}

@inproceedings{kenzel_michael_2021_cuda,
  title = "CUDA and Applications to Task-based Programming",
  author = "Michael Kenzel and Bernhard Kerbl and Martin Winter and Markus Steinberger",
  year = "2021",
  abstract = "To provide a profound understanding of how CUDA applications can achieve peak performance, the first two parts of this tutorial outline the modern CUDA architecture. Following a basic introduction, we expose how language features are linked to---and constrained by---the underlying physical hardware components. Furthermore, we describe common applications for massively parallel programming, offer a detailed breakdown of potential issues, and list ways to mitigate performance impacts. An exemplary analysis of PTX and SASS snippets illustrates how code patterns in CUDA are mapped to actual hardware instructions. In parts 3 and 4, we focus on novel features that were enabled by the arrival of CUDA 10+ toolkits and the Volta+ architectures, such as ITS, tensor cores, and the graph API. In addition to basic use case demonstrations, we outline our own experiences with these capabilities and their potential performance benefits. We also discuss how long-standing best practices are affected by these changes and describe common caveats for dealing with legacy code on recent GPU models.
  We show how these considerations can be implemented in practice by presenting state-of-the-art research into task-based GPU scheduling, and how the dynamic adjustment of thread roles and group configurations can significantly increase performance.",
  month = may,
  booktitle = "Eurographics 2021 - Tutorials",
  doi = "10.2312/egt.20211037",
  editor = "Carol O'Sullivan and Dieter Schmalstieg",
  location = "Vienna",
  publisher = "The Eurographics Association",
  pages = "1--5",
  keywords = "Parallel Programming, GPU",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2021/kenzel_michael_2021_cuda/",
}

@misc{kerbl-2020-improvencoding,
  title = "Improved Triangle Encoding for Cached Adaptive Tessellation",
  author = "Linus Horvath and Bernhard Kerbl and Michael Wimmer",
  year = "2020",
  month = jul,
  location = "online",
  event = "HPG 2020",
  note = "Poster presented at HPG 2020 (2020-05-01--2020-06-22)",
  keywords = "GPU, tessellation, real-time",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2020/kerbl-2020-improvencoding/",
}

@inproceedings{tatzgern-2020-sst,
  title = "Stochastic Substitute Trees for Real-Time Global Illumination",
  author = "Wolfgang Tatzgern and Benedikt Mayr and Bernhard Kerbl and Markus Steinberger",
  year = "2020",
  abstract = "With the introduction of hardware-supported ray tracing and deep learning for denoising, computer graphics has made a considerable step toward real-time global illumination. In this work, we present an alternative global illumination method: the stochastic substitute tree (SST), a hierarchical structure inspired by lightcuts, with light probability distributions as inner nodes. Our approach distributes virtual point lights (VPLs) in every frame and efficiently constructs the SST over those lights by clustering according to Morton codes. Global illumination is approximated by sampling the SST and considers the BRDF at the hit location as well as the SST nodes' intensities for importance sampling directly from inner nodes of the tree. To remove the introduced Monte Carlo noise, we use a recurrent autoencoder. In combination with temporal filtering, we deliver real-time global illumination for complex scenes with challenging light distributions.",
  month = may,
  event = "I3D '20",
  booktitle = "Symposium on Interactive 3D Graphics and Games",
  pages = "1--9",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2020/tatzgern-2020-sst/",
}

@inproceedings{unterguggenberger-2020-fmvr,
  title = "Fast Multi-View Rendering for Real-Time Applications",
  author = "Johannes Unterguggenberger and Bernhard Kerbl and Markus Steinberger and Dieter Schmalstieg and Michael Wimmer",
  year = "2020",
  abstract = "Efficient rendering of multiple views can be a critical performance factor for real-time rendering applications. Generating more than one view multiplies the amount of rendered geometry, which can cause a huge performance impact. Minimizing that impact has been a target of previous research and GPU manufacturers, who have started to equip devices with dedicated acceleration units. However, vendor-specific acceleration is not the only option to increase multi-view rendering (MVR) performance. Available graphics API features, shader stages, and optimizations can be exploited for improved MVR performance, while generally offering more versatile pipeline configurations, including the preservation of custom tessellation and geometry shaders. In this paper, we present an exhaustive evaluation of MVR pipelines available on modern GPUs.
  We provide a detailed analysis of previous techniques and hardware-accelerated MVR, and propose a novel method, leading to the creation of an MVR catalogue. Our analyses cover three distinct applications to help gain clarity on overall MVR performance characteristics. Our interpretation of the observed results provides a guideline for selecting the most appropriate technique for various use cases on different GPU architectures.",
  month = may,
  isbn = "978-3-03868-107-6",
  organization = "Eurographics",
  location = "online",
  event = "EGPGV 2020",
  editor = "Frey, Steffen and Huang, Jian and Sadlo, Filip",
  doi = "10.2312/pgv.20201071",
  booktitle = "Eurographics Symposium on Parallel Graphics and Visualization",
  pages = "13--23",
  keywords = "Real-Time Rendering, Rasterization, Multi-View, OVR_multiview, Geometry Shader, Evaluation",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2020/unterguggenberger-2020-fmvr/",
}

@misc{kerbl_2019_planet_poster,
  title = "Real-time Rendering of Procedural Planets at Arbitrary Altitudes",
  author = "Florian Michelic and Michael Kenzel and Karl Haubenwallner and Bernhard Kerbl and Markus Steinberger",
  year = "2019",
  abstract = "Focusing on real-time, high-fidelity rendering, we present a novel approach for the combined consideration of four major phenomena that define the visual representation of entire planets: we present a simple and fast solution for the distortion-free generation of 3D planetary terrain, spherical ocean waves, and efficient rendering of volumetric clouds along with atmospheric scattering. Our approach to terrain and ocean mesh generation relies on a projected, persistent grid that can instantaneously and smoothly adapt to fast-changing viewpoints. For generating planetary ocean surfaces, we present a wave function that creates seamless, evenly spaced waves across the entire planet without causing unsightly artifacts. We further show how to render volumetric clouds in combination with precomputed atmospheric scattering and account for their contribution to light transport above ground. Our method provides mathematically consistent approximations of cloud-atmosphere interactions and works for any viewpoint and direction, ensuring continuous transitions in appearance as the viewer moves from ground to space. Among others, our approach supports cloud shadows, light shafts, ocean reflections, and earth shadows on the clouds. The sum of these effects can be visualized at more than 120 frames per second on current graphics processing units.",
  month = may,
  note = "Voted best poster of I3D '19; poster presented at I3D 2019 (2019-05-21--2019-05-23)",
  location = "Montreal, Canada",
  event = "I3D 2019",
  keywords = "planet, rendering",
  URL = "https://www.cg.tuwien.ac.at/research/publications/2019/kerbl_2019_planet_poster/",
}