Skip to content

Commit

Permalink
[Profiler] Fix bugs in the timer_create-based CPU profiler (#6229)
Browse files Browse the repository at this point in the history
## Reason for change

The implementation was not correct when introduced the first time.
* The timer id was not correctly initialized and save
* The thread registering was done too early in the process
* Missing checks (not safe to unwind)
* Save and Restore `errno`: libunwind can change `errno` value and this
can impact the application behavior

## Implementation details

* Add the missing check
* Add class to handle errno save and restore
* Fix the thread registration and the timer id initialization and
saving.
  • Loading branch information
gleocadie authored Nov 4, 2024
1 parent 64c02f7 commit 8d9670a
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,7 @@ void TimerCreateCpuProfiler::RegisterThread(std::shared_ptr<ManagedThreadInfo> t
void TimerCreateCpuProfiler::UnregisterThread(std::shared_ptr<ManagedThreadInfo> threadInfo)
{
std::shared_lock lock(_registerLock);

auto timerId = threadInfo->SetTimerId(-1);

if (timerId != -1)
{
Log::Debug("Unregister timer for thread ", threadInfo->GetOsThreadId());
syscall(__NR_timer_delete, timerId);
}
UnregisterThreadImpl(threadInfo.get());
}

const char* TimerCreateCpuProfiler::GetName()
Expand Down Expand Up @@ -95,8 +88,16 @@ bool TimerCreateCpuProfiler::StartImpl()

bool TimerCreateCpuProfiler::StopImpl()
{
_pSignalManager->UnRegisterHandler();
{
std::unique_lock lock(_registerLock);

// We have to remove all timers before unregistering the handler for SIGPROF.
// Otherwise, the process will end with exit code 155 (128 + 27 => 27 being SIGPROF value)
_pManagedThreadsList->ForEach([this](ManagedThreadInfo* thread) { UnregisterThreadImpl(thread); });
}

Instance = nullptr;
_pSignalManager->UnRegisterHandler();

return true;
}
Expand All @@ -112,6 +113,46 @@ bool TimerCreateCpuProfiler::CollectStackSampleSignalHandler(int sig, siginfo_t*
return instance->Collect(ucontext);
}

// This symbol is defined in the Datadog.Linux.ApiWrapper. It allows us to check if the thread to be profiled
// contains a frame of a function that might cause a deadlock.
extern "C" unsigned long long dd_inside_wrapped_functions() __attribute__((weak));

bool TimerCreateCpuProfiler::CanCollect(void* ctx)
{
// TODO (in another PR): add metrics about reasons we could not collect
if (dd_inside_wrapped_functions != nullptr && dd_inside_wrapped_functions() != 0)
{
return false;
}

auto* context = reinterpret_cast<ucontext_t*>(ctx);
// If SIGSEGV is part of the sigmask set, it means that the thread was executing
// the SIGSEGV signal handler (or someone blocks SIGSEGV signal for this thread,
// but that less likely)
if (sigismember(&(context->uc_sigmask), SIGSEGV) == 1)
{
return false;
}

return true;
}

struct ErrnoSaveAndRestore
{
public:
ErrnoSaveAndRestore() :
_oldErrno{errno}
{
}
~ErrnoSaveAndRestore()
{
errno = _oldErrno;
}

private:
int _oldErrno;
};

bool TimerCreateCpuProfiler::Collect(void* ctx)
{
auto threadInfo = ManagedThreadInfo::CurrentThreadInfo;
Expand All @@ -121,7 +162,12 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
return false;
}

auto* context = reinterpret_cast<unw_context_t*>(ctx);
// Libunwind can overwrite the value of errno - save it beforehand and restore it at the end
ErrnoSaveAndRestore errnoScope;
if (!CanCollect(ctx))
{
return false;
}

auto callstack = _callstackProvider.Get();

Expand All @@ -131,6 +177,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
}

auto buffer = callstack.Data();
auto* context = reinterpret_cast<unw_context_t*>(ctx);
auto count = unw_backtrace2((void**)buffer.data(), buffer.size(), context, UNW_INIT_SIGNAL_FRAME);
callstack.SetCount(count);

Expand Down Expand Up @@ -182,10 +229,23 @@ void TimerCreateCpuProfiler::RegisterThreadImpl(ManagedThreadInfo* threadInfo)
return;
}

threadInfo->SetTimerId(timerId);

std::int32_t _interval = std::chrono::duration_cast<std::chrono::nanoseconds>(_samplingInterval).count();
struct itimerspec ts;
ts.it_interval.tv_sec = (time_t)(_interval / 1000000000);
ts.it_interval.tv_nsec = _interval % 1000000000;
ts.it_value = ts.it_interval;
syscall(__NR_timer_settime, timerId, 0, &ts, nullptr);
}

void TimerCreateCpuProfiler::UnregisterThreadImpl(ManagedThreadInfo* threadInfo)
{
auto timerId = threadInfo->SetTimerId(-1);

if (timerId != -1)
{
Log::Debug("Unregister timer for thread ", threadInfo->GetOsThreadId());
syscall(__NR_timer_delete, timerId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,12 @@ class TimerCreateCpuProfiler : public ServiceBase

private:
static bool CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext);
static bool CanCollect(void* context);
static TimerCreateCpuProfiler* Instance;

bool Collect(void* ucontext);
void RegisterThreadImpl(ManagedThreadInfo* thread);
void UnregisterThreadImpl(ManagedThreadInfo* threadInfo);

bool StartImpl() override;
bool StopImpl() override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1629,8 +1629,12 @@ HRESULT STDMETHODCALLTYPE CorProfilerCallback::ThreadAssignedToOSThread(ThreadID
// is the same native thread assigned to the managed thread.
ManagedThreadInfo::CurrentThreadInfo = threadInfo;

#ifdef LINUX
_pManagedThreadList->SetThreadOsInfo(managedThreadId, osThreadId, dupOsThreadHandle);

#ifdef LINUX
// This call must be made *after* we assigne the SetThreadOsInfo function call.
// Otherwise the threadInfo won't have it's OsThread field set and timer_create
// will have random behavior.
if (_pCpuProfiler != nullptr)
{
_pCpuProfiler->RegisterThread(threadInfo);
Expand Down Expand Up @@ -1666,7 +1670,6 @@ HRESULT STDMETHODCALLTYPE CorProfilerCallback::ThreadAssignedToOSThread(ThreadID
return S_OK;
}
#endif
_pManagedThreadList->SetThreadOsInfo(managedThreadId, osThreadId, dupOsThreadHandle);

return S_OK;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ ManagedThreadInfo::ManagedThreadInfo(ThreadID clrThreadId, ICorProfilerInfo4* pC
_traceContext{},
#ifdef LINUX
_sharedMemoryArea{nullptr},
_timerId{-1},
#endif
_info{pCorProfilerInfo},
_blockingThreadId{0}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2022 Datadog, Inc.
// </copyright>

using System.IO;
using System.Linq;
using Datadog.Profiler.IntegrationTests.Helpers;
using FluentAssertions;
using Xunit;
Expand All @@ -22,6 +24,32 @@ public TimerCreateCpuProfilerTest(ITestOutputHelper output)
_output = output;
}

[TestAppFact("Samples.Computer01")]
public void CheckLogForError(string appName, string framework, string appAssembly)
{
var runner = new TestApplicationRunner(appName, framework, appAssembly, _output, commandLine: CmdLine);
// disable default profilers except CPU
EnvironmentHelper.DisableDefaultProfilers(runner);
runner.Environment.SetVariable(EnvironmentVariables.CpuProfilerEnabled, "1");
runner.Environment.SetVariable(EnvironmentVariables.CpuProfilerType, "TimerCreate");

using var agent = MockDatadogAgent.CreateHttpAgent(_output);

runner.Run(agent);

var logFile = Directory.GetFiles(runner.Environment.LogDir)
.Single(f => Path.GetFileName(f).StartsWith("DD-DotNet-Profiler-Native-"));

var logLines = File.ReadLines(logFile);

logLines.Should().ContainMatch("*timer_create Cpu profiler is enabled*");

logLines.Should().NotContainMatch("*Call to timer_create failed for thread 0*");
logLines.Should().NotContainMatch("*Timer was already created for thread 0*");
logLines.Should().NotContainMatch("*Call to timer_create failed for thread 0*");
SamplesHelper.GetSamples(runner.Environment.PprofDir).Should().NotBeEmpty("No samples were found");
}

[TestAppFact("Samples.Computer01")]
public void CheckCpuSamples(string appName, string framework, string appAssembly)
{
Expand All @@ -38,7 +66,9 @@ public void CheckCpuSamples(string appName, string framework, string appAssembly
runner.Run(agent);

// only cpu profiler enabled so should see 1 value per sample and
foreach (var (_, _, values) in SamplesHelper.GetSamples(runner.Environment.PprofDir))
var samples = SamplesHelper.GetSamples(runner.Environment.PprofDir);
samples.Should().NotBeEmpty();
foreach (var (_, _, values) in samples)
{
values.Length.Should().Be(1);
values.Should().OnlyContain(x => x == long.Parse(samplingInterval) * 1_000_000);
Expand All @@ -61,7 +91,34 @@ public void CheckCpuSamplesForDefaultSampingInterval(string appName, string fram

var expectedInterval = long.Parse(samplingInterval) * 1_000_000;
// only cpu profiler enabled so should see 1 value per sample and
foreach (var (_, _, values) in SamplesHelper.GetSamples(runner.Environment.PprofDir))
var samples = SamplesHelper.GetSamples(runner.Environment.PprofDir);
samples.Should().NotBeEmpty();
foreach (var (_, _, values) in samples)
{
values.Length.Should().Be(1);
values.Should().OnlyContain(x => x == expectedInterval);
}
}

[TestAppFact("Samples.Computer01")]
public void CheckDefaultCpuSamplingInterval(string appName, string framework, string appAssembly)
{
var runner = new TestApplicationRunner(appName, framework, appAssembly, _output, commandLine: CmdLine);
var samplingInterval = "9"; // ms (default)
// disable default profilers except CPU
EnvironmentHelper.DisableDefaultProfilers(runner);
runner.Environment.SetVariable(EnvironmentVariables.CpuProfilerType, "TimerCreate");
runner.Environment.SetVariable(EnvironmentVariables.CpuProfilerEnabled, "1");

using var agent = MockDatadogAgent.CreateHttpAgent(_output);

runner.Run(agent);

var expectedInterval = long.Parse(samplingInterval) * 1_000_000;
// only cpu profiler enabled so should see 1 value per sample and
var samples = SamplesHelper.GetSamples(runner.Environment.PprofDir);
samples.Should().NotBeEmpty();
foreach (var (_, _, values) in samples)
{
values.Length.Should().Be(1);
values.Should().OnlyContain(x => x == expectedInterval);
Expand Down

0 comments on commit 8d9670a

Please sign in to comment.