#include #include #include #include #include #include #include "archive.hpp" // for Demangle #include "paje_trace.hpp" #include "profiler.hpp" #include "mpi_wrapper.hpp" extern const char *header; constexpr int MPI_PAJE_WRITER = 1; namespace ngcore { static std::string GetTimerName( int id ) { #ifndef PARALLEL return NgProfiler::GetName(id); #else // PARALLEL if(id(std::numeric_limits::max()), max_tracefile_size/bytes_per_event/(nthreads+1+trace_thread_counter*nthreads)*10/7); if(max_num_events_per_thread>0) { logger->info( "Tracefile size = {}MB", max_tracefile_size/1024/1024); logger->info( "Tracing {} events per thread", max_num_events_per_thread); } tasks.resize(nthreads); int reserve_size = std::min(1000000U, max_num_events_per_thread); for(auto & t : tasks) t.reserve(reserve_size); links.resize(nthreads); for(auto & l : links) l.reserve(reserve_size); jobs.reserve(reserve_size); timer_events.reserve(reserve_size); // sync start time when running in parallel #ifdef PARALLEL NgMPI_Comm comm(MPI_COMM_WORLD); for(auto i : Range(5)) comm.Barrier(); #endif // PARALLEL start_time = GetTimeCounter(); tracing_enabled = true; } PajeTrace :: ~PajeTrace() { for(auto & ltask : tasks) for(auto & task : ltask) { task.start_time -= start_time; task.stop_time -= start_time; } for(auto & job : jobs) { job.start_time -= start_time; job.stop_time -= start_time; } for(auto & event : timer_events) event.time -= start_time; for(auto & llink : links) for(auto & link : llink) link.time -= start_time; NgMPI_Comm comm(MPI_COMM_WORLD); if(comm.Size()==1) { Write(tracefile_name); } else { // make sure the timer id is unique across all ranks for(auto & event : timer_events) event.timer_id += NgProfiler::SIZE*comm.Rank(); if(comm.Rank() == MPI_PAJE_WRITER) Write(tracefile_name); else SendData(); } } void PajeTrace::StopTracing() { if(tracing_enabled && max_num_events_per_thread>0) { logger->warn("Maximum number of traces reached, tracing is stopped now."); } tracing_enabled = false; } class PajeFile { public: static void Hue2RGB ( double x, double &r, double &g, double &b ) { double d = 1.0/6.0; if(x logger = GetLogger("PajeTrace"); double ConvertTime(TTimePoint t) { // return time in milliseconds as double // return std::chrono::duration(t-start_time).count()*1000.0; // return std::chrono::duration(t-start_time).count() / 2.7e3; return 1000.0*static_cast(t) * seconds_per_tick; } enum PType { SET_VARIABLE=1, ADD_VARIABLE, SUB_VARIABLE, PUSH_STATE, POP_STATE, START_LINK, STOP_LINK }; struct PajeEvent { PajeEvent( int aevent_type, double atime, int atype, int acontainer, double avar_value ) : time(atime), var_value(avar_value), event_type(aevent_type), type(atype), container(acontainer) { } PajeEvent( int aevent_type, double atime, int atype, int acontainer, int avalue = 0, int aid = 0, bool avalue_is_alias = true ) : time(atime), event_type(aevent_type), type(atype), container(acontainer), value(avalue), id(aid), value_is_alias(avalue_is_alias) { } PajeEvent( int aevent_type, double atime, int atype, int acontainer, int avalue, int astart_container, int akey ) : time(atime), event_type(aevent_type), type(atype), container(acontainer), value(avalue), start_container(astart_container), id(akey) { } double time; double var_value = 0.0; int event_type; int type; int container; int value = 0; int start_container = 0; int id = 0; bool value_is_alias = true; bool operator < (const PajeEvent & other) const { // Same start and stop times can occur for very small tasks -> take "starting" events first (eg. PajePushState before PajePopState) if(time == other.time) return event_type < other.event_type; return (time < other.time); } int write(FILE *stream) { const int &key = id; const int &end_container = start_container; switch(event_type) { case PajeSetVariable: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%.15g\n", PajeSetVariable, time, type, container, var_value ); // NOLINT case PajeAddVariable: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%.15g\n", PajeAddVariable, time, type, container, var_value ); // NOLINT case PajeSubVariable: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%.15g\n", PajeSubVariable, time, type, container, var_value ); // NOLINT case PajePushState: if(value_is_alias) return fprintf( stream, "%d\t%.15g\ta%d\ta%d\ta%d\t%d\n", PajePushState, time, type, container, value, id); // NOLINT else return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%d\t%d\n", PajePushState, time, type, container, value, id); // NOLINT case PajePopState: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\n", PajePopState, time, type, container ); // NOLINT case PajeStartLink: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%d\ta%d\t%d\n", PajeStartLink, time, type, container, value, start_container, key ); // NOLINT case PajeEndLink: return fprintf( stream, "%d\t%.15g\ta%d\ta%d\t%d\ta%d\t%d\n", PajeEndLink, time, type, container, value, end_container, key ); // NOLINT } return 0; } }; std::vector events; public: PajeFile() = delete; PajeFile(const PajeFile &) = delete; PajeFile(PajeFile &&) = delete; void operator=(const PajeFile &) = delete; void operator=(PajeFile &&) = delete; PajeFile( const std::string & filename) { ctrace_stream = fopen (filename.c_str(),"w"); // NOLINT fprintf(ctrace_stream, "%s", header ); // NOLINT alias_counter = 0; } ~PajeFile() { fclose (ctrace_stream); // NOLINT } int DefineContainerType ( int parent_type, const std::string & name ) { int alias = ++alias_counter; if(parent_type!=0) fprintf( ctrace_stream, "%d\ta%d\ta%d\t\"%s\"\n", PajeDefineContainerType, alias, parent_type, name.c_str() ); // NOLINT else fprintf( ctrace_stream, "%d\ta%d\t%d\t\"%s\"\n", PajeDefineContainerType, alias, parent_type, name.c_str() ); // NOLINT return alias; } int DefineVariableType ( int container_type, const std::string & name ) { int alias = ++alias_counter; fprintf( ctrace_stream, "%d\ta%d\ta%d\t\"%s\"\t\"1.0 1.0 1.0\"\n", PajeDefineVariableType, alias, container_type, name.c_str() ); // NOLINT return alias; } int DefineStateType ( int type, const std::string & name ) { int alias = ++alias_counter; fprintf( ctrace_stream, "%d\ta%d\ta%d\t\"%s\"\n", PajeDefineStateType, alias, type, name.c_str() ); // NOLINT return alias; } // int DefineEventType () // { // Write("event not implemented"); // } int DefineLinkType (int parent_container_type, int start_container_type, int stop_container_type, const std::string & name) { int alias = ++alias_counter; fprintf( ctrace_stream, "%d\ta%d\ta%d\ta%d\ta%d\t\"%s\"\n", PajeDefineLinkType, alias, parent_container_type, start_container_type, stop_container_type, name.c_str() ); // NOLINT return alias; } int DefineEntityValue (int type, const std::string & name, double hue = -1) { if(hue==-1) { std::hash shash; size_t h = shash(name); h ^= h>>32U; h = static_cast(h); hue = h*1.0/std::numeric_limits::max(); } int alias = ++alias_counter; double r; double g; double b; Hue2RGB( hue, r, g, b ); fprintf( ctrace_stream, "%d\ta%d\ta%d\t\"%s\"\t\"%.15g %.15g %.15g\"\n", PajeDefineEntityValue, alias, type, name.c_str(), r,g,b ); // NOLINT return alias; } int CreateContainer ( int type, int parent, const std::string & name ) { int alias = ++alias_counter; if(parent!=0) fprintf( ctrace_stream, "%d\t0\ta%d\ta%d\ta%d\t\"%s\"\n", PajeCreateContainer, alias, type, parent, name.c_str() ); // NOLINT else fprintf( ctrace_stream, "%d\t0\ta%d\ta%d\t%d\t\"%s\"\n", PajeCreateContainer, alias, type, parent, name.c_str() ); // NOLINT return alias; } void DestroyContainer () {} void SetVariable (TTimePoint time, int type, int container, double value ) { events.emplace_back( PajeEvent( PajeSetVariable, ConvertTime(time), type, container, value ) ); } void AddVariable (TTimePoint time, int type, int container, double value ) { events.emplace_back( PajeEvent( PajeAddVariable, ConvertTime(time), type, container, value ) ); } void SubVariable (TTimePoint time, int type, int container, double value ) { events.emplace_back( PajeEvent( PajeSubVariable, ConvertTime(time), type, container, value ) ); } void SetState () {} void PushState ( TTimePoint time, int type, int container, int value, int id = 0, bool value_is_alias = true ) { events.emplace_back( PajeEvent( PajePushState, ConvertTime(time), type, container, value, id, value_is_alias) ); } void PopState ( TTimePoint time, int type, int container ) { events.emplace_back( PajeEvent( PajePopState, ConvertTime(time), type, container ) ); } void ResetState () {} void StartLink ( TTimePoint time, int type, int container, int value, int start_container, int key ) { events.emplace_back( PajeEvent( PajeStartLink, ConvertTime(time), type, container, value, start_container, key ) ); } void EndLink ( TTimePoint time, int type, int container, int value, int end_container, int key ) { events.emplace_back( PajeEvent( PajeEndLink, ConvertTime(time), type, container, value, end_container, key ) ); } void NewEvent () {} void WriteEvents() { logger->info("Sorting traces..."); std::sort (events.begin(), events.end()); logger->info("Writing traces... "); for (auto & event : events) { event.write( ctrace_stream ); // fprintf( ctrace_stream, "%s", buf ); // NOLINT } logger->info("Done"); } private: enum { PajeDefineContainerType = 0, PajeDefineVariableType = 1, PajeDefineStateType = 2, PajeDefineEventType = 3, PajeDefineLinkType = 4, PajeDefineEntityValue = 5, PajeCreateContainer = 6, PajeDestroyContainer = 7, PajeSetVariable = 8, PajeAddVariable = 9, PajeSubVariable = 10, PajeSetState = 11, PajePushState = 12, PajePopState = 13, PajeResetState = 14, PajeStartLink = 15, PajeEndLink = 16, PajeNewEvent = 17 }; }; NGCORE_API PajeTrace *trace; void PajeTrace::Write( const std::string & filename ) { auto n_events = jobs.size() + timer_events.size(); for(auto & vtasks : tasks) n_events += vtasks.size(); logger->info("{} events traced", n_events); if(n_events==0) { logger->info("No data traced, skip writing trace file"); return; } if(!tracing_enabled) { logger->warn("Tracing stopped during computation due to tracefile size limit of {} megabytes.", max_tracefile_size/1024/1024); } PajeFile paje(filename); const int container_type_task_manager = paje.DefineContainerType( 0, "Task Manager" ); const int container_type_node = paje.DefineContainerType( container_type_task_manager, "Node"); const int container_type_thread = paje.DefineContainerType( container_type_task_manager, "Thread"); const int container_type_timer = container_type_thread; //paje.DefineContainerType( container_type_task_manager, "Timers"); const int container_type_jobs = paje.DefineContainerType( container_type_task_manager, "Jobs"); const int state_type_job = paje.DefineStateType( container_type_jobs, "Job" ); const int state_type_task = paje.DefineStateType( container_type_thread, "Task" ); const int state_type_timer = paje.DefineStateType( container_type_timer, "Timer state" ); int variable_type_active_threads = 0; if(trace_thread_counter) paje.DefineVariableType( container_type_jobs, "Active threads" ); const int container_task_manager = paje.CreateContainer( container_type_task_manager, 0, "The task manager" ); const int container_jobs = paje.CreateContainer( container_type_jobs, container_task_manager, "Jobs" ); if(trace_thread_counter) paje.SetVariable( 0, variable_type_active_threads, container_jobs, 0.0 ); int num_nodes = 1; //task_manager ? task_manager->GetNumNodes() : 1; std::vector thread_aliases; std::vector container_nodes; #ifdef PARALLEL // Hostnames NgMPI_Comm comm(MPI_COMM_WORLD); auto rank = comm.Rank(); auto nranks = comm.Size(); if(nranks>1) { nthreads = nranks; thread_aliases.reserve(nthreads); std::array ahostname; int len; MPI_Get_processor_name(ahostname.data(), &len); std::string hostname = ahostname.data(); std::map host_map; std::string name; for(auto i : IntRange(0, nranks)) { if(i!=MPI_PAJE_WRITER) comm.Recv(name, i, 0); else name = hostname; if(host_map.count(name)==0) { host_map[name] = container_nodes.size(); container_nodes.emplace_back( paje.CreateContainer( container_type_node, container_task_manager, name) ); } thread_aliases.emplace_back( paje.CreateContainer( container_type_thread, container_nodes[host_map[name]], "Rank " + ToString(i) ) ); } } else #endif // PARALLEL { container_nodes.reserve(num_nodes); for(int i=0; i job_map; std::map job_task_map; for(Job & j : jobs) if(job_map.find(j.type) == job_map.end()) { std::string name = Demangle(j.type->name()); job_map[j.type] = paje.DefineEntityValue( state_type_job, name, -1 ); job_task_map[j.type] = paje.DefineEntityValue( state_type_task, name, -1 ); } for(Job & j : jobs) { paje.PushState( j.start_time, state_type_job, container_jobs, job_map[j.type] ); paje.PopState( j.stop_time, state_type_job, container_jobs ); } std::set timer_ids; std::map timer_aliases; std::map timer_names; for(auto & event : timer_events) timer_ids.insert(event.timer_id); // Timer names for(auto & vtasks : tasks) for (Task & t : vtasks) if(t.id_type==Task::ID_TIMER) timer_ids.insert(t.id); for(auto id : timer_ids) timer_names[id] = GetTimerName(id); #ifdef PARALLEL if(nranks>1) { for(auto src : IntRange(0, nranks)) { if(src==MPI_PAJE_WRITER) continue; size_t n_timers; comm.Recv (n_timers, src, 0); int id; std::string name; for(auto i : IntRange(n_timers)) { comm.Recv (id, src, 0); comm.Recv (name, src, 0); timer_ids.insert(id); timer_names[id] = name; } } } #endif // PARALLEL for(auto id : timer_ids) timer_aliases[id] = paje.DefineEntityValue( state_type_timer, timer_names[id], -1 ); int timerdepth = 0; int maxdepth = 0; for(auto & event : timer_events) { if(event.is_start) { timerdepth++; maxdepth = timerdepth>maxdepth ? timerdepth : maxdepth; } else timerdepth--; } std::vector timer_container_aliases; timer_container_aliases.resize(maxdepth); for(int i=0; i1) { for(auto & event : timer_events) { if(event.is_start) paje.PushState( event.time, state_type_timer, thread_aliases[MPI_PAJE_WRITER], timer_aliases[event.timer_id] ); else paje.PopState( event.time, state_type_timer, thread_aliases[MPI_PAJE_WRITER] ); } // Timer events Array timer_id; Array time; Array is_start; Array thread_id; for(auto src : IntRange(0, nranks)) { if(src==MPI_PAJE_WRITER) continue; comm.Recv (timer_id, src, 0); comm.Recv (time, src, 0); comm.Recv (is_start, src, 0); comm.Recv (thread_id, src, 0); for(auto i : Range(timer_id.Size())) { TimerEvent event; event.timer_id = timer_id[i]; event.time = time[i]; event.is_start = is_start[i]; event.thread_id = thread_id[i]; if(event.is_start) paje.PushState( event.time, state_type_timer, thread_aliases[src], timer_aliases[event.timer_id] ); else paje.PopState( event.time, state_type_timer, thread_aliases[src] ); } } } #endif // PARALLEL // Merge link event int nlinks = 0; for( auto & l : links) nlinks += l.size(); std::vector links_merged; links_merged.reserve(nlinks); std::vector pos(nthreads); int nlinks_merged = 0; while(nlinks_merged < nlinks) { int minpos = -1; TTimePoint mintime = -1; for (int t = 0; t started_links; int link_type = paje.DefineLinkType(container_type_node, container_type_thread, container_type_thread, "links"); // match links for ( auto & l : links_merged ) { if(l.is_start) { started_links.push_back(l); } else { unsigned int i = 0; while(i ahostname; int len; MPI_Get_processor_name(ahostname.data(), &len); hostname = ahostname.data(); } comm.Send(hostname, MPI_PAJE_WRITER, 0); // Timer names std::set timer_ids; std::map timer_names; for(auto & event : timer_events) timer_ids.insert(event.timer_id); for(auto id : timer_ids) timer_names[id] = GetTimerName(id); size_t size = timer_ids.size(); comm.Send(size, MPI_PAJE_WRITER, 0); for(auto id : timer_ids) { comm.Send(id, MPI_PAJE_WRITER, 0); comm.Send(timer_names[id], MPI_PAJE_WRITER, 0); } // Timer events Array timer_id; Array time; Array is_start; Array thread_id; for(auto & event : timer_events) { timer_id.Append(event.timer_id); time.Append(event.time); is_start.Append(event.is_start); thread_id.Append(event.thread_id); } comm.Send (timer_id, MPI_PAJE_WRITER, 0); comm.Send (time, MPI_PAJE_WRITER, 0); comm.Send (is_start, MPI_PAJE_WRITER, 0); comm.Send (thread_id, MPI_PAJE_WRITER, 0); #endif // PARALLEL } /////////////////////////////////////////////////////////////////// // Write HTML file drawing a sunburst chart with cumulated timings struct TreeNode { int id = 0; std::map children; double chart_size = 0.0; // time without children (the chart lib accumulates children sizes again) double time = 0.0; double min_time = 1e99; double max_time = 0.0; size_t calls = 0; std::string name; TTimePoint start_time = 0; }; void PrintNode (const TreeNode &n, int &level, std::ofstream & f); void PrintNode (const TreeNode &n, int &level, std::ofstream & f) { f << "{ name: \"" + n.name + "\""; f << ", calls: " << n.calls; f << ", size: " << n.chart_size; f << ", time: " << n.time; f << ", min: " << n.min_time; f << ", max: " << n.max_time; f << ", avg: " << n.time/n.calls; int size = n.children.size(); if(size>0) { int i = 0; f << ", children: ["; for(auto & c : n.children) { PrintNode(c.second, level, f); if(++i events; TreeNode root; root.name="all"; TreeNode *current = &root; std::vector node_stack; node_stack.push_back(&root); TTimePoint stop_time = 0; for(auto & event : timer_events) { events.push_back(event); stop_time = std::max(event.time, stop_time); } std::map jobs_map; std::vector job_names; for(auto & job : jobs) { auto name = Demangle(job.type->name()); int id = job_names.size(); if(jobs_map.count(name)==0) { jobs_map[name] = id; job_names.push_back(name); } else id = jobs_map[name]; events.push_back(TimerEvent{-1, job.start_time, true, id}); events.push_back(TimerEvent{-1, job.stop_time, false, id}); stop_time = std::max(job.stop_time, stop_time); } std::sort (events.begin(), events.end()); root.time = 1000.0*static_cast(stop_time) * seconds_per_tick; root.calls = 1; root.min_time = root.time; root.max_time = root.time; for(auto & event : events) { bool is_timer_event = event.timer_id != -1; int id = is_timer_event ? event.timer_id : event.thread_id; if(event.is_start) { bool need_init = !current->children.count(id); node_stack.push_back(current); current = ¤t->children[id]; if(need_init) { current->name = is_timer_event ? GetTimerName(id) : job_names[id]; current->time = 0.0; current->id = id; } current->start_time = event.time; } else { if(node_stack.size()==0) { std::cout << "node stack empty!" << std::endl; break; } double time = 1000.0*static_cast(event.time-current->start_time) * seconds_per_tick; current->time += time; current->chart_size += time; current->min_time = std::min(current->min_time, time); current->max_time = std::max(current->max_time, time); current->calls++; current = node_stack.back(); current->chart_size -= time; node_stack.pop_back(); } } root.chart_size = 0.0; int level = 0; std::ofstream f(tracefile_name+".html"); f.precision(4); f << R"CODE_(
)CODE_" << std::endl; } } // namespace ngcore const char *header = "%EventDef PajeDefineContainerType 0 \n" "% Alias string \n" "% Type string \n" "% Name string \n" "%EndEventDef \n" "%EventDef PajeDefineVariableType 1 \n" "% Alias string \n" "% Type string \n" "% Name string \n" "% Color color \n" "%EndEventDef \n" "%EventDef PajeDefineStateType 2 \n" "% Alias string \n" "% Type string \n" "% Name string \n" "%EndEventDef \n" "%EventDef PajeDefineEventType 3 \n" "% Alias string \n" "% Type string \n" "% Name string \n" "% Color color \n" "%EndEventDef \n" "%EventDef PajeDefineLinkType 4 \n" "% Alias string \n" "% Type string \n" "% StartContainerType string \n" "% EndContainerType string \n" "% Name string \n" "%EndEventDef \n" "%EventDef PajeDefineEntityValue 5 \n" "% Alias string \n" "% Type string \n" "% Name string \n" "% Color color \n" "%EndEventDef \n" "%EventDef PajeCreateContainer 6 \n" "% Time date \n" "% Alias string \n" "% Type string \n" "% Container string \n" "% Name string \n" "%EndEventDef \n" "%EventDef PajeDestroyContainer 7 \n" "% Time date \n" "% Type string \n" "% Name string \n" "%EndEventDef \n" "%EventDef PajeSetVariable 8 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value double \n" "%EndEventDef\n" "%EventDef PajeAddVariable 9 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value double \n" "%EndEventDef\n" "%EventDef PajeSubVariable 10 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value double \n" "%EndEventDef\n" "%EventDef PajeSetState 11 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value string \n" "%EndEventDef\n" "%EventDef PajePushState 12 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value string \n" "% Id string \n" "%EndEventDef\n" "%EventDef PajePopState 13 \n" "% Time date \n" "% Type string \n" "% Container string \n" "%EndEventDef\n" "%EventDef PajeResetState 14 \n" "% Time date \n" "% Type string \n" "% Container string \n" "%EndEventDef\n" "%EventDef PajeStartLink 15 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value string \n" "% StartContainer string \n" "% Key string \n" "%EndEventDef\n" "%EventDef PajeEndLink 16 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value string \n" "% EndContainer string \n" "% Key string \n" "%EndEventDef\n" "%EventDef PajeNewEvent 17 \n" "% Time date \n" "% Type string \n" "% Container string \n" "% Value string \n" "%EndEventDef\n";