From 50a806ea671114c92b7905182a0a9140148415b2 Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 29 Aug 2018 00:27:03 +0200
Subject: [PATCH] renderer_opengl: Implement a buffer cache.

The idea of this cache is to avoid redundant uploads: uploaded buffers
are cached within the stream_buffer, and their old offsets are simply
reused.
The next step is to implement a VBO cache in GPU memory, but for now
I want to check the overhead of the cache management. Fetching the
buffer over PCI-E should be quite fast.
---
 src/video_core/CMakeLists.txt                 |   1 +
 .../renderer_opengl/gl_buffer_cache.cpp       |  90 ++++++++++++++++
 .../renderer_opengl/gl_buffer_cache.h         |  57 ++++++++++
 .../renderer_opengl/gl_rasterizer.cpp         | 101 +++++-------------
 .../renderer_opengl/gl_rasterizer.h           |  19 ++--
 5 files changed, 182 insertions(+), 86 deletions(-)
 create mode 100644 src/video_core/renderer_opengl/gl_buffer_cache.cpp
 create mode 100644 src/video_core/renderer_opengl/gl_buffer_cache.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index aa5bc3bbee..1982b76c4d 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -22,6 +22,7 @@ add_library(video_core STATIC
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_rasterizer.cpp
     renderer_opengl/gl_rasterizer.h
     renderer_opengl/gl_rasterizer_cache.cpp
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
new file mode 100644
index 0000000000..c85fbd3066
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -0,0 +1,90 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/memory.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+
+namespace OpenGL {
+/// Constructs the backing stream buffer with target GL_ARRAY_BUFFER and the given size.
+OGLBufferCache::OGLBufferCache(size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {}
+
+/// Copies `size` bytes of guest memory at `gpu_addr` into the mapped stream buffer and returns
+/// the offset of the data in that buffer, reusing a previously cached upload when possible.
+GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, size_t size, size_t alignment,
+                                      bool cache) {
+    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
+    const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
+    // The translation result is optional: assert instead of dereferencing an empty one.
+    ASSERT_MSG(cpu_addr, "Invalid GPU address");
+
+    // Cache management is a big overhead, so only cache entries of at least a given size.
+    // TODO: Figure out which size is the best for given games.
+    cache &= size >= 2048;
+    if (cache) {
+        auto entry = TryGet(*cpu_addr);
+        if (entry) {
+            if (entry->size >= size && entry->alignment == alignment) {
+                return entry->offset;
+            }
+            Unregister(entry); // too small or differently aligned: re-uploaded below
+        }
+    }
+
+    AlignBuffer(alignment);
+    const GLintptr uploaded_offset = buffer_offset;
+    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+    buffer_ptr += size;
+    buffer_offset += size;
+
+    if (cache) {
+        auto entry = std::make_shared<CachedBufferEntry>();
+        entry->offset = uploaded_offset;
+        entry->size = size;
+        entry->alignment = alignment;
+        entry->addr = *cpu_addr;
+        Register(entry);
+    }
+    return uploaded_offset;
+}
+
+GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, size_t size, size_t alignment) {
+    // Pad the write cursor first so the offset handed back honours the requested alignment.
+    AlignBuffer(alignment);
+    const GLintptr uploaded_offset = buffer_offset;
+    std::memcpy(buffer_ptr, raw_pointer, size);
+    buffer_ptr += size;
+    buffer_offset += size;
+    return uploaded_offset;
+}
+/// Maps max_size bytes of the stream buffer and points the write cursor at the mapping start.
+void OGLBufferCache::Map(size_t max_size) {
+    bool invalidate;
+    std::tie(buffer_ptr, buffer_offset_base, invalidate) =
+        stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
+    buffer_offset = buffer_offset_base;
+    // Stream buffer signalled invalidation; presumably old cached offsets are stale, so drop all.
+    if (invalidate) {
+        InvalidateAll();
+    }
+}
+void OGLBufferCache::Unmap() {
+    stream_buffer.Unmap(buffer_offset - buffer_offset_base); // bytes consumed since Map()
+}
+/// Returns the handle reported by the backing stream buffer.
+GLuint OGLBufferCache::GetHandle() {
+    return stream_buffer.GetHandle();
+}
+
+void OGLBufferCache::AlignBuffer(size_t alignment) {
+    // Pad the offset (not the mapped pointer) up to the alignment; the pointer moves in lockstep.
+    const auto unaligned = static_cast<size_t>(buffer_offset);
+    const auto padding = static_cast<GLintptr>(Common::AlignUp(unaligned, alignment) - unaligned);
+    buffer_ptr += padding;
+    buffer_offset += padding;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
new file mode 100644
index 0000000000..9c7ad27e65
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -0,0 +1,57 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include "common/common_types.h"
+#include "video_core/rasterizer_cache.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_stream_buffer.h"
+
+namespace OpenGL {
+/// Bookkeeping for one cached upload; filled in by OGLBufferCache::UploadMemory.
+struct CachedBufferEntry final {
+    VAddr GetAddr() const {
+        return addr;
+    }
+
+    size_t GetSizeInBytes() const {
+        return size;
+    }
+
+    VAddr addr;       // CPU address the data was read from
+    size_t size;      // number of bytes uploaded
+    GLintptr offset;  // offset of the data inside the stream buffer
+    size_t alignment; // alignment the upload was made with
+};
+
+class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
+public:
+    explicit OGLBufferCache(size_t size); // size is forwarded to the backing stream buffer
+    /// Uploads guest memory at gpu_addr (or reuses a cached copy); returns its buffer offset.
+    GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, size_t size, size_t alignment = 4,
+                          bool cache = true);
+    /// Copies host memory into the mapped buffer (never cached); returns its buffer offset.
+    GLintptr UploadHostMemory(const void* raw_pointer, size_t size, size_t alignment = 4);
+    /// Map() sets up the write cursor used by the uploads; Unmap() reports the bytes written.
+    void Map(size_t max_size);
+    void Unmap();
+    /// Handle of the backing stream buffer.
+    GLuint GetHandle();
+
+protected:
+    void AlignBuffer(size_t alignment); // pads the write cursor up to `alignment`
+
+private:
+    OGLStreamBuffer stream_buffer;
+    // Write cursor into the current mapping: set by Map() and advanced by every upload.
+    u8* buffer_ptr = nullptr;
+    GLintptr buffer_offset = 0;
+    GLintptr buffer_offset_base = 0; // offset at which the current mapping starts
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7ee3f2ae71..c66a18155e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -43,7 +43,7 @@ MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info)
-    : emu_window{window}, screen_info{info}, stream_buffer(GL_ARRAY_BUFFER, STREAM_BUFFER_SIZE) {
+    : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) {
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
@@ -83,14 +83,14 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
 
     hw_vao.Create();
 
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
+    state.draw.vertex_buffer = buffer_cache.GetHandle();
 
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.draw.vertex_array = hw_vao.handle;
     state.Apply();
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer.GetHandle());
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer_cache.GetHandle());
 
     glEnable(GL_BLEND);
 
@@ -101,14 +101,13 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
 
 RasterizerOpenGL::~RasterizerOpenGL() {}
 
-std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
-                                                             GLintptr buffer_offset) {
+void RasterizerOpenGL::SetupVertexArrays() {
     MICROPROFILE_SCOPE(OpenGL_VAO);
     const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     state.draw.vertex_array = hw_vao.handle;
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
+    state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
     // Upload all guest vertex arrays sequentially to our buffer
@@ -127,12 +126,10 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         ASSERT(end > start);
         u64 size = end - start + 1;
 
-        GLintptr vertex_buffer_offset;
-        std::tie(array_ptr, buffer_offset, vertex_buffer_offset) =
-            UploadMemory(array_ptr, buffer_offset, start, size);
+        GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size);
 
         // Bind the vertex array to the buffer at the current offset.
-        glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
+        glBindVertexBuffer(index, buffer_cache.GetHandle(), vertex_buffer_offset,
                            vertex_array.stride);
 
         if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
@@ -177,11 +174,9 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         }
         glVertexAttribBinding(index, attrib.buffer);
     }
-
-    return {array_ptr, buffer_offset};
 }
 
-std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
+void RasterizerOpenGL::SetupShaders() {
     MICROPROFILE_SCOPE(OpenGL_Shader);
     auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
 
@@ -199,21 +194,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
             continue;
         }
 
-        std::tie(buffer_ptr, buffer_offset) =
-            AlignBuffer(buffer_ptr, buffer_offset, static_cast<size_t>(uniform_buffer_alignment));
-
         const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu.state.shader_stages[stage]);
-        std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
+        GLintptr offset = buffer_cache.UploadHostMemory(
+            &ubo, sizeof(ubo), static_cast<size_t>(uniform_buffer_alignment));
 
         // Bind the buffer
-        glBindBufferRange(GL_UNIFORM_BUFFER, stage, stream_buffer.GetHandle(), buffer_offset,
-                          sizeof(ubo));
-
-        buffer_ptr += sizeof(ubo);
-        buffer_offset += sizeof(ubo);
+        glBindBufferRange(GL_UNIFORM_BUFFER, stage, buffer_cache.GetHandle(), offset, sizeof(ubo));
 
         Shader shader{shader_cache.GetStageProgram(program)};
 
@@ -234,9 +223,8 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
         }
 
         // Configure the const buffers for this shader stage.
-        std::tie(buffer_ptr, buffer_offset, current_constbuffer_bindpoint) =
-            SetupConstBuffers(buffer_ptr, buffer_offset, static_cast<Maxwell::ShaderStage>(stage),
-                              shader, current_constbuffer_bindpoint);
+        current_constbuffer_bindpoint = SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage),
+                                                          shader, current_constbuffer_bindpoint);
 
         // Configure the textures for this shader stage.
         current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader,
@@ -250,8 +238,6 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
     }
 
     shader_program_manager->UseTrivialGeometryShader();
-
-    return {buffer_ptr, buffer_offset};
 }
 
 size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -439,31 +425,6 @@ void RasterizerOpenGL::Clear() {
     glClear(clear_mask);
 }
 
-std::pair<u8*, GLintptr> RasterizerOpenGL::AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset,
-                                                       size_t alignment) {
-    // Align the offset, not the mapped pointer
-    GLintptr offset_aligned =
-        static_cast<GLintptr>(Common::AlignUp(static_cast<size_t>(buffer_offset), alignment));
-    return {buffer_ptr + (offset_aligned - buffer_offset), offset_aligned};
-}
-
-std::tuple<u8*, GLintptr, GLintptr> RasterizerOpenGL::UploadMemory(u8* buffer_ptr,
-                                                                   GLintptr buffer_offset,
-                                                                   Tegra::GPUVAddr gpu_addr,
-                                                                   size_t size, size_t alignment) {
-    std::tie(buffer_ptr, buffer_offset) = AlignBuffer(buffer_ptr, buffer_offset, alignment);
-    GLintptr uploaded_offset = buffer_offset;
-
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-    const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
-    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
-
-    buffer_ptr += size;
-    buffer_offset += size;
-
-    return {buffer_ptr, buffer_offset, uploaded_offset};
-}
-
 void RasterizerOpenGL::DrawArrays() {
     if (accelerate_draw == AccelDraw::Disabled)
         return;
@@ -489,7 +450,7 @@ void RasterizerOpenGL::DrawArrays() {
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
     const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
 
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
+    state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
     size_t buffer_size = CalculateVertexArraysSize();
@@ -506,25 +467,21 @@ void RasterizerOpenGL::DrawArrays() {
     // Add space for at least 18 constant buffers
     buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
 
-    u8* buffer_ptr;
-    GLintptr buffer_offset;
-    std::tie(buffer_ptr, buffer_offset, std::ignore) =
-        stream_buffer.Map(static_cast<GLsizeiptr>(buffer_size), 4);
-    u8* buffer_ptr_base = buffer_ptr;
+    buffer_cache.Map(buffer_size);
 
-    std::tie(buffer_ptr, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
+    SetupVertexArrays();
 
     // If indexed mode, copy the index buffer
     GLintptr index_buffer_offset = 0;
     if (is_indexed) {
         MICROPROFILE_SCOPE(OpenGL_Index);
-        std::tie(buffer_ptr, buffer_offset, index_buffer_offset) = UploadMemory(
-            buffer_ptr, buffer_offset, regs.index_array.StartAddress(), index_buffer_size);
+        index_buffer_offset =
+            buffer_cache.UploadMemory(regs.index_array.StartAddress(), index_buffer_size);
     }
 
-    std::tie(buffer_ptr, buffer_offset) = SetupShaders(buffer_ptr, buffer_offset);
+    SetupShaders();
 
-    stream_buffer.Unmap(buffer_ptr - buffer_ptr_base);
+    buffer_cache.Unmap();
 
     shader_program_manager->ApplyTo(state);
     state.Apply();
@@ -569,6 +526,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     res_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
+    buffer_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
@@ -658,11 +616,8 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_ptr,
-                                                                   GLintptr buffer_offset,
-                                                                   Maxwell::ShaderStage stage,
-                                                                   Shader& shader,
-                                                                   u32 current_bindpoint) {
+u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader,
+                                        u32 current_bindpoint) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
@@ -699,13 +654,11 @@ std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_pt
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        GLintptr const_buffer_offset;
-        std::tie(buffer_ptr, buffer_offset, const_buffer_offset) =
-            UploadMemory(buffer_ptr, buffer_offset, buffer.address, size,
-                         static_cast<size_t>(uniform_buffer_alignment));
+        GLintptr const_buffer_offset = buffer_cache.UploadMemory(
+            buffer.address, size, static_cast<size_t>(uniform_buffer_alignment));
 
         glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint,
-                          stream_buffer.GetHandle(), const_buffer_offset, size);
+                          buffer_cache.GetHandle(), const_buffer_offset, size);
 
         // Now configure the bindpoint of the buffer inside the shader
         glUniformBlockBinding(shader->GetProgramHandle(),
@@ -715,7 +668,7 @@ std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_pt
 
     state.Apply();
 
-    return {buffer_ptr, buffer_offset, current_bindpoint + static_cast<u32>(entries.size())};
+    return current_bindpoint + static_cast<u32>(entries.size());
 }
 
 u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, u32 current_unit) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 30045ebff6..4c4b084b8f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -18,7 +18,9 @@
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -109,9 +111,8 @@ private:
      * @param current_bindpoint The offset at which to start counting new buffer bindpoints.
      * @returns The next available bindpoint for use in the next shader stage.
      */
-    std::tuple<u8*, GLintptr, u32> SetupConstBuffers(
-        u8* buffer_ptr, GLintptr buffer_offset, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-        Shader& shader, u32 current_bindpoint);
+    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader,
+                          u32 current_bindpoint);
 
     /*
      * Configures the current textures to use for the draw command.
@@ -173,22 +174,16 @@ private:
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
 
     static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
-    OGLStreamBuffer stream_buffer;
+    OGLBufferCache buffer_cache;
     OGLBuffer uniform_buffer;
     OGLFramebuffer framebuffer;
     GLint uniform_buffer_alignment;
 
     size_t CalculateVertexArraysSize() const;
 
-    std::pair<u8*, GLintptr> SetupVertexArrays(u8* array_ptr, GLintptr buffer_offset);
+    void SetupVertexArrays();
 
-    std::pair<u8*, GLintptr> SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
-
-    std::pair<u8*, GLintptr> AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, size_t alignment);
-
-    std::tuple<u8*, GLintptr, GLintptr> UploadMemory(u8* buffer_ptr, GLintptr buffer_offset,
-                                                     Tegra::GPUVAddr gpu_addr, size_t size,
-                                                     size_t alignment = 4);
+    void SetupShaders();
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw = AccelDraw::Disabled;
-- 
GitLab