// torch/csrc/autograd/profiler.h
#pragma once
#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <sstream>
#include <forward_list>
#include <tuple>
#include <functional>
#include <fstream>
#include <stdexcept>
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#ifdef _WIN32
#include <chrono>
#include <type_traits>
#else
#include <ctime>
#endif
#include <torch/csrc/autograd/record_function.h>
typedef struct CUevent_st* CUDAEventStub;
namespace torch { namespace autograd {
struct Node;
namespace profiler {
struct TORCH_API CUDAStubs {
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
    fail();
  }
  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    return 0.f;
  }
  virtual void nvtxMarkA(const char* name) {
    fail();
  }
  virtual void nvtxRangePushA(const char* name) {
    fail();
  }
  virtual void nvtxRangePop() {
    fail();
  }
  virtual bool enabled() {
    return false;
  }
  virtual void onEachDevice(std::function<void(int)> op) {
    fail();
  }
  virtual void synchronize() {
    fail();
  }
  virtual ~CUDAStubs();

private:
  void fail() {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};
TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
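// A CUDA-enabled build is expected to subclass CUDAStubs with real CUDA/NVTX
// calls and register an instance at load time. A minimal sketch, assuming a
// hypothetical CUDAMethods subclass (the names below are illustrative, not
// part of this header):
//
//   struct CUDAMethods : public CUDAStubs {
//     bool enabled() override { return true; }
//     // ... override record/elapsed/nvtx*/synchronize with real calls ...
//   };
//   static CUDAMethods methods;
//   static struct Registerer {
//     Registerer() { registerCUDAMethods(&methods); }
//   } registerer;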
// Rounds a up to the nearest multiple of b, e.g. ceilToMultiple(10, 8) == 16.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return ((a + b - 1) / b) * b;
}
#if (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
#include <sys/time.h>
// clock_gettime is not implemented on older versions of OS X (< 10.12).
// If implemented, CLOCK_REALTIME will have already been defined.
// On iOS, clock_gettime is only available on iOS 10.0 or newer. Unlike OS X,
// iOS defines CLOCK_REALTIME whether or not clock_gettime is implemented, so
// the macro cannot be used to detect availability there.
#endif
inline int64_t getTime() {
#ifdef _WIN32
  using namespace std::chrono;
  using clock = std::conditional<
      high_resolution_clock::is_steady,
      high_resolution_clock,
      steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
#elif (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<int64_t>(now.tv_sec) * 1000000000 +
      static_cast<int64_t>(now.tv_usec) * 1000;
#else
  // clock_gettime is *much* faster than the std::chrono implementation on Linux.
  struct timespec t{};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return static_cast<int64_t>(t.tv_sec) * 1000000000 +
      static_cast<int64_t>(t.tv_nsec);
#endif
}
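// Example (illustrative): getTime() returns a timestamp in nanoseconds
// (monotonic where the platform allows), so the duration of a region can be
// measured as a difference of two calls:
//
//   int64_t start = getTime();
//   // ... code being timed ...
//   int64_t elapsed_ns = getTime() - start;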
// Old GCC versions generate warnings incorrectly
// see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
#ifndef _MSC_VER
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif
enum class TORCH_API ProfilerState {
  Disabled,
  CPU, // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};
struct TORCH_API ProfilerConfig {
  ProfilerConfig(ProfilerState state, bool report_input_shapes)
      : state(state), report_input_shapes(report_input_shapes) {}
  ~ProfilerConfig();
  ProfilerState state;
  bool report_input_shapes;
};
enum class TORCH_API EventKind : uint16_t {
  Mark,
  PushRange,
  PopRange
};
#ifndef _MSC_VER
# pragma GCC diagnostic pop
#endif
struct TORCH_API Event final {
  Event(
      EventKind kind,
      StringView name,
      uint16_t thread_id,
      bool record_cuda,
      std::vector<std::vector<int64_t>>&& shapes = {})
      : name_(std::move(name)),
        kind_(kind),
        thread_id_(thread_id),
        shapes_(std::move(shapes)) {
    record(record_cuda);
  }
  void record(bool record_cuda);
  std::string kind() const {
    switch (kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }
  const char* name() const {
    return name_.str();
  }
  uint16_t thread_id() const {
    return thread_id_;
  }
  std::vector<std::vector<int64_t>> shapes() const {
    return shapes_;
  }
  double cpu_elapsed_us(const Event& e) {
    return (e.cpu_ns_ - cpu_ns_) / 1000.0;
  }
  double cuda_elapsed_us(const Event& e);
  bool has_cuda() const {
    return event != nullptr;
  }
  int device() const {
    return device_;
  }

private:
  // signed to allow for negative intervals, initialized for safety.
  int64_t cpu_ns_ = 0;
  StringView name_;
  EventKind kind_;
  uint16_t thread_id_;
  std::vector<std::vector<int64_t>> shapes_;
  int device_ = -1;
  struct CUevent_st* event = nullptr;
};
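// Example (illustrative): given the Events for a matched push/pop pair on the
// same thread, the wall-clock duration of the range in microseconds is
//
//   double us = push_event.cpu_elapsed_us(pop_event);
//
// and cuda_elapsed_us() gives the device-side duration when both events were
// recorded with record_cuda == true.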
// A linked list of fixed-size vectors, used to avoid a std::vector resize
// taking a large amount of time inside a profiling event.
struct RangeEventList {
  // This mutex is used to serialize access when different threads are writing
  // to the same instance of RangeEventList.
  std::mutex mutex_;
  constexpr static size_t MB = 1024 * 1024;
  constexpr static size_t event_block_size = 16 * MB;
  constexpr static size_t num_block_elements =
      event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
  static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
                "num_block_elements is calculated incorrectly");
  using block_type = std::vector<Event>;

  template <typename... Args>
  void record(Args&&... args) {
    std::lock_guard<std::mutex> guard(mutex_);
    if (blocks.empty() || blocks.front().size() == num_block_elements) {
      allocBlock();
    }
    blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> consolidate() {
    std::unique_lock<std::mutex> lock(mutex_);
    std::forward_list<block_type> localBlocks;
    localBlocks.swap(blocks);
    lock.unlock();
    std::vector<Event> result;
    for (auto& block : localBlocks) {
      // Newer blocks sit at the front of the list, so inserting each block at
      // the beginning of the result yields chronological order.
      result.insert(
          result.begin(),
          std::make_move_iterator(block.begin()),
          std::make_move_iterator(block.end()));
    }
    return result;
  }

  std::forward_list<block_type> blocks;

private:
  // allocBlock() assumes that mutex_ is held when called, in order to prevent
  // multiple threads' block writes stomping over each other.
  void allocBlock() {
    blocks.emplace_front();
    auto& new_block = blocks.front();
    new_block.reserve(num_block_elements);
    // Touch every page in the new block up front so that page faults happen
    // here rather than adding jitter while events are being recorded.
    const char* const end_ptr =
        reinterpret_cast<char*>(new_block.data() + num_block_elements);
    for (volatile const char* ptr = reinterpret_cast<char*>(new_block.data());
         ptr < end_ptr;
         ptr += 4 * 1024) {
      (*ptr);
    }
  }
};
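// Example (illustrative): record() forwards its arguments to the Event
// constructor, and consolidate() drains all blocks into a single vector in
// chronological order.
//
//   RangeEventList list;
//   list.record(EventKind::Mark, StringView("step"), /*thread_id=*/0,
//               /*record_cuda=*/false);
//   std::vector<Event> events = list.consolidate();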
TORCH_API RangeEventList& getEventList();
TORCH_API void mark(std::string name, bool include_cuda = true);
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
// no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerConfig);
TORCH_API thread_event_lists disableProfiler();
TORCH_API bool profilerEnabled();
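// Example (illustrative): profiling a region of CPU code and walking the
// collected events, one inner vector per thread that recorded events:
//
//   enableProfiler(ProfilerConfig(ProfilerState::CPU,
//                                 /*report_input_shapes=*/false));
//   // ... code to profile ...
//   thread_event_lists lists = disableProfiler();
//   for (const auto& thread_events : lists) {
//     for (const auto& evt : thread_events) {
//       std::cout << evt.kind() << " " << evt.name() << "\n";
//     }
//   }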
// Usage:
//   {
//     RecordProfile guard("filename.trace");
//     // code you want to profile
//   }
// Then open filename.trace in chrome://tracing.
struct TORCH_API RecordProfile {
  RecordProfile(std::ostream& out);
  RecordProfile(const std::string& filename);
  ~RecordProfile();

private:
  void init();
  std::unique_ptr<std::ofstream> file_;
  std::ostream& out_;
  void processEvents(const std::vector<Event*>& events);
};
} // namespace profiler
}} // namespace torch::autograd