#include "device_context.hh" #include "queue_context.hh" #include #include #include namespace low_latency { DeviceContext::DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical_device, const VkDevice& device, const bool was_antilag_requested, VkuDeviceDispatchTable&& vtable) : instance(parent_instance), physical_device(parent_physical_device), was_antilag_requested(was_antilag_requested), device(device), vtable(std::move(vtable)) { // Only create our clock if we can support creating it. if (this->physical_device.supports_required_extensions) { this->clock = std::make_unique(*this); } } DeviceContext::~DeviceContext() { this->present_queue.reset(); // We will let the destructor handle clearing here, but they should be // unique by now (ie, removed from the layer's context map). for (const auto& [queue, queue_context] : this->queues) { assert(queue_context.unique()); } } DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) { this->calibrate(); } DeviceContext::Clock::~Clock() {} DeviceContext::Clock::time_point_t DeviceContext::Clock::now() { auto ts = timespec{}; if (clock_gettime(CLOCK_MONOTONIC, &ts)) { throw errno; } return time_point_t{std::chrono::seconds{ts.tv_sec} + std::chrono::nanoseconds{ts.tv_nsec}}; } void DeviceContext::Clock::calibrate() { const auto infos = std::vector{ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT}, {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}}; struct CalibratedResult { std::uint64_t device; std::uint64_t host; }; auto calibrated_result = CalibratedResult{}; THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR( device.device, 2, std::data(infos), &calibrated_result.device, &this->error_bound)); this->device_ticks = calibrated_result.device; this->host_ns = calibrated_result.host; } DeviceContext::Clock::time_point_t DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { const auto& pd = device.physical_device.properties; const auto ns_tick = static_cast(pd->limits.timestampPeriod); const auto diff = [&]() -> auto { auto a = this->device_ticks; auto b = ticks; const auto is_negative = a > b; if (is_negative) { std::swap(a, b); } const auto abs_diff = b - a; assert(abs_diff <= std::numeric_limits::max()); const auto signed_abs_diff = static_cast(abs_diff); return is_negative ? -signed_abs_diff : signed_abs_diff; }(); const auto diff_nsec = static_cast(static_cast(diff) * ns_tick + 0.5); const auto delta = std::chrono::nanoseconds( this->host_ns + static_cast(diff_nsec)); return time_point_t{delta}; } void DeviceContext::sleep_in_input() { // Present hasn't happened yet, we don't know what queue to attack. if (!this->present_queue) { return; } const auto& frames = this->present_queue->in_flight_frames; // No frame here means we're behind the GPU and do not need to delay. // If anything we should speed up... if (!std::size(frames)) { return; } // If we're here, that means that there might be an outstanding frame that's // sitting on our present_queue which hasn't yet completed, so we need to // stall until it's finished. const auto& last_frame = frames.back(); assert(std::size(last_frame.submissions)); const auto& last_frame_submission = last_frame.submissions.back(); last_frame_submission->end_handle->get_time_spinlock(); // From our sleep in present implementation, just spinning until // the previous frame has completed did not work well. This was because // there was a delay between presentation and when new work was given // to the GPU. If we stalled the CPU without trying to account for this, we // would get huge frame drops, loss of throughput, and the GPU would even // clock down. So naturally I am concerned about this approach, but it seems // to perform well so far in my own testing and is just beautifully elegant. } void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) { this->antilag_mode = data.mode; this->antilag_fps = data.maxFPS; // TODO // This might not be provided (probably just to set some settings?). if (!data.pPresentationInfo) { return; } // Only care about the input stage for now. if (data.pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) { return; } if (this->antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) { return; } this->sleep_in_input(); } void DeviceContext::notify_queue_present(const QueueContext& queue) { assert(this->queues.contains(queue.queue)); this->present_queue = this->queues[queue.queue]; } } // namespace low_latency