Commit d19d5748 authored by Ruben Laso Rodríguez
Browse files

Merge branch 'memory_migrations'

parents 137b2c1a c8f16d75
......@@ -3,13 +3,14 @@ execbase = migration_tool
execbase_512 = migration_tool
just_ins = migration_tool_I
just_ins_512 = migration_tool_I
gprof = migration_tool
CC = c++
LIBS =
INCLUDE = -I./
SRC = migration_tool.cpp
CPP_FLAGS = -std=c++17 -O3 -march=native -ffast-math # -g -pg -fno-inline #-fopt-info #-O0 -Wall # Debug options
CPP_FLAGS = -std=c++17 -O3 -march=native -ffast-math -DNDEBUG
LIB_FLAGS = $(LIBS) -lnuma -lpfm $(INCLUDE)
default:
......@@ -24,5 +25,8 @@ just_ins:
just_ins_512:
$(CC) $(CPP_FLAGS) -o $(execbase) $(SRC) $(LIB_FLAGS) -DUSE_512B_INS
gprof:
$(CC) $(CPP_FLAGS) -o $(execbase) $(SRC) $(LIB_FLAGS) -g -pg
# Remove the built executable and object files; -f keeps the target from failing when they do not exist.
clean:
	rm -f $(execbase) *.o
/*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <r.laso@usc.es> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return Ruben Laso
* ----------------------------------------------------------------------------
*/
#ifndef MEM_MIGRATIONS_MEM_MIGRATION_CELL_HPP
#define MEM_MIGRATIONS_MEM_MIGRATION_CELL_HPP
#include <vector>
#include "migration/migration_var.hpp"
#include "migration/tickets.hpp"
#include "performance/performance.hpp"
#include "system_info/system_info.hpp"
#include "utils/Verbose.hpp"
#include "utils/types.hpp"
namespace migration {
// Describes one pending memory-page migration: the page at addr_ (plus prefetch_ following pages)
// owned by pid_ is to be moved from node src_ to node dst_.
class mem_migration_cell {
private:
    static constexpr size_t DEFAULT_PREFETCH = 0;

    addr_t addr_;                // Address of the page to migrate
    pid_t pid_;                  // PID of the process owning the memory page
    node_t dst_;                 // Destination node
    node_t src_;                 // Previous node
    size_t prefetch_;            // Number of pages to prefetch
    std::vector<double> ratios_; // Access ratios, indexed by node

    // Share of the accesses attributed to dst_, as a percentage.
    // Returns 0 when ratios_ holds no entry for dst_ instead of reading out of bounds.
    [[nodiscard]] double dst_access_percent() const {
        const auto idx = static_cast<size_t>(dst_);
        return idx < ratios_.size() ? ratios_[idx] * 100 : 0.0;
    }

    // Prints the outcome of a migration attempt to std::cout.
    // The stream's format flags and precision are restored afterwards so the
    // two-decimal formatting used here does not leak into later output.
    void log_migration(const bool success) const {
        const auto old_flags     = std::cout.flags();
        const auto old_precision = std::cout.precision();

        std::cout << (success ? "Migrated " : "Failed to migrate ");
        if (prefetch_ > 0) {
            std::cout << prefetch_ << " memory pages starting from ";
        } else {
            std::cout << "memory page ";
        }

        std::cout << std::hex << reinterpret_cast<void *>(addr_) << std::dec << " (PID " << pid_
                  << ") to node " << dst_;
        std::cout.precision(2);
        std::cout << std::fixed << " (" << dst_access_percent() << "% of the accesses) ";
        std::cout << "from node " << src_ << '\n';

        std::cout.flags(old_flags);
        std::cout.precision(old_precision);
    }

public:
    mem_migration_cell() = delete;

    // Builds a cell that migrates a single page (no prefetching).
    mem_migration_cell(const addr_t addr, const pid_t pid, const node_t src, const node_t dst,
                       const std::vector<double> & ratios) :
        mem_migration_cell(addr, pid, src, dst, DEFAULT_PREFETCH, ratios) {
    }

    // Builds a cell that migrates the page at `addr` and asks for `prefetch` additional pages.
    // Members are initialised in declaration order.
    mem_migration_cell(const addr_t addr, const pid_t pid, const node_t src, const node_t dst,
                       const size_t prefetch, const std::vector<double> & ratios) :
        addr_(addr),
        pid_(pid), dst_(dst), src_(src), prefetch_(prefetch), ratios_(ratios) {
    }

    // Requests the migration through memory_info::move_pages().
    // On success the global counter migration::total_memory_migrations is incremented.
    // With VERBOSE_LVL >= VERB_LVL4 the outcome (success or failure) is logged to std::cout.
    // Returns the value reported by memory_info::move_pages().
    inline bool migrate() const {
        const bool ret = memory_info::move_pages(addr_, prefetch_, pid_, dst_);

        if (ret) {
            migration::total_memory_migrations++;
        }

        if (VERBOSE_LVL >= VERB_LVL4) {
            log_migration(ret);
        }

        return ret;
    }

    // Writes a one-line, human-readable description of the cell.
    friend std::ostream & operator<<(std::ostream & os, const mem_migration_cell & mc) {
        os << "Memory page migration cell."
           << " Address " << std::hex << reinterpret_cast<void *>(mc.addr_) << std::dec << " (PID " << mc.pid_
           << ") to be migrated to NODE " << mc.dst_ << " (" << mc.dst_access_percent()
           << "% of the accesses). It was in NODE " << mc.src_ << ".";
        return os;
    }
};
} // namespace migration
#endif //MEM_MIGRATIONS_MEM_MIGRATION_CELL_HPP
This diff is collapsed.
......@@ -11,11 +11,15 @@
#ifndef MIGRATION_MIGRATION_CELL_HPP
#define MIGRATION_MIGRATION_CELL_HPP
#include "migration.hpp"
#include "migration_var.hpp"
#include <vector>
#include "migration/migration_var.hpp"
#include "migration/tickets.hpp"
#include "performance/performance.hpp"
#include "system_info/system_info.hpp"
#include "tickets.hpp"
#include "utils/Verbose.hpp"
#include "utils/types.hpp"
......
......@@ -25,12 +25,24 @@ namespace migration {
size_t total_thread_migrations = 0;
size_t total_thread_migrations_undone = 0;
size_t total_memory_migrations = 0;
size_t total_memory_migrations_undone = 0;
namespace details {
memory_data_list_t memory_list;
inst_data_list_t inst_list;
reqs_data_list_t reqs_list;
namespace thread {
memory_data_list_t memory_list;
inst_data_list_t inst_list;
reqs_data_list_t reqs_list;
time_point last_mig_time = hres_clock::now();
} // namespace thread
namespace memory {
memory_data_list_t memory_list;
time_point last_mig_time = hres_clock::now();
} // namespace memory
time_point last_mig_time = hres_clock::now();
} // namespace details
} // namespace migration
......
//
// Created by ruben on 29/04/2020.
//
#ifndef MEM_MIGRATIONS_MEMPAGES_TABLE_HPP
#define MEM_MIGRATIONS_MEMPAGES_TABLE_HPP
#include <vector>
#include "migration/utils/mem_list.hpp"
#include "system_info/system_info.hpp"
#include "utils/types.hpp"
namespace performance {
namespace memtable_details {
// Access statistics of a single memory page: how many accesses came from each node.
class row {
public:
    std::vector<double> node_accesses_; // Accesses from each node
    std::vector<double> ratios_;        // Cached per-node share of the accesses; empty means "not computed"

    // One zeroed counter per memory node reported by system_info::num_of_memories().
    row() : node_accesses_(system_info::num_of_memories(), 0) {
    }

    // Zeroes every counter and drops the cached ratios.
    inline void clear() {
        std::fill(node_accesses_.begin(), node_accesses_.end(), 0.0);
        ratios_.clear();
    }

    // Adds `count` accesses coming from `node`.
    inline void add_data(const node_t & node, const double count) {
        node_accesses_[node] += count;
        // The cached ratios no longer match the counters: force ratios() to recompute them.
        ratios_.clear();
    }

    // Index of the node with the most accesses (the lowest index wins ties).
    [[nodiscard]] inline node_t preferred_node() const {
        return std::distance(node_accesses_.begin(),
                             std::max_element(node_accesses_.begin(), node_accesses_.end()));
    }

    // Copy of the per-node access counters.
    [[nodiscard]] inline auto reqs_per_node() {
        return node_accesses_;
    }

    // Read-only view of the per-node access counters.
    [[nodiscard]] inline const auto & reqs_per_node() const {
        return node_accesses_;
    }

    // Recomputes the share of accesses of each node (counter / total), stores it in ratios_
    // and returns a copy. All ratios are 0 when no access has been recorded.
    auto compute_ratios() {
        ratios_.assign(node_accesses_.size(), 0.0);

        const auto total_accesses = std::accumulate(node_accesses_.begin(), node_accesses_.end(), 0.0);

        if (total_accesses != 0) {
            for (size_t n = 0; n < node_accesses_.size(); n++) {
                ratios_[n] = node_accesses_[n] / total_accesses;
            }
        }

        return ratios_;
    }

    // Returns the cached ratios, computing them first if the cache is empty.
    auto ratios() {
        return ratios_.empty() ? compute_ratios() : ratios_;
    }

    // Prints the counters separated (and followed) by a single space.
    friend std::ostream & operator<<(std::ostream & os, const row & r) {
        for (const auto accesses : r.node_accesses_) {
            os << accesses << " ";
        }
        return os;
    }
};
} // namespace memtable_details
class mempages_table {
public:
std::map<addr_t, memtable_details::row> table_;
std::map<addr_t, pid_t> page_to_last_pid_;
mempages_table() noexcept : table_() {
}
inline void update(const memory_data_list_t & mem_list, double aging_factor = 1.0) {
add_data(mem_list, aging_factor);
}
inline void add_data(const memory_data_list_t & list, double aging_factor = 1.0) {
for (const auto & sample : list.list_) {
add_data(sample, aging_factor);
}
}
inline void add_data(const memory_data_cell_t & sample, double aging_factor = 1.0) {
const addr_t page_addr = sample.addr_ & ~(static_cast<long int>(memory_info::pagesize - 1));
const auto page_node = memory_info::get_page_current_node(page_addr, sample.tid_);
if (__glibc_unlikely(page_node < 0)) {
return;
}
auto page_it = table_.find(page_addr);
if (__glibc_unlikely(page_it == table_.end())) { // = !contains(). We init the entry if it doesn't exist
table_[page_addr] = memtable_details::row();
page_it = table_.find(page_addr);
}
page_it->second.add_data(system_info::node_from_cpu(sample.cpu_), sample.reqs_ * aging_factor);
page_to_last_pid_[page_addr] = sample.tid_;
}
inline const auto & info(const addr_t page) const {
return table_.at(page);
}
inline void clear_it() {
table_.clear();
}
inline void remove_entry(const addr_t page_addr) {
table_.erase(page_addr);
}
[[nodiscard]] inline node_t preferred_node(const addr_t page_addr) const {
return const_cast<mempages_table *>(this)->table_[page_addr].preferred_node();
}
[[nodiscard]] inline auto reqs_per_node(const addr_t page_addr) {
return this->table_[page_addr].reqs_per_node();
}
[[nodiscard]] inline const auto & reqs_per_node(const addr_t page_addr) const {
return this->table_.at(page_addr).reqs_per_node();
}
[[nodiscard]] inline auto last_pid_to_access(const addr_t page_addr) const {
return page_to_last_pid_.at(page_addr);
}
friend std::ostream & operator<<(std::ostream & os, const mempages_table & t) {
for (const auto & [page_addr, row] : t.table_) {
const auto & pid = t.page_to_last_pid_.at(page_addr);
const auto & node = memory_info::get_page_current_node(page_addr, pid);
os << "Memory Page " << std::hex << reinterpret_cast<void *>(page_addr) << std::dec << " (NODE " << node
<< ", LAST PID " << pid << "): " << row << '\n';
}
return os;
}
};
} // namespace performance
#endif //MEM_MIGRATIONS_MEMPAGES_TABLE_HPP
......@@ -20,11 +20,18 @@
#include <unistd.h>
#include "migration/performance/performance.hpp"
#include "migration/utils/inst_list.hpp"
#include "migration/utils/reqs_list.hpp"
#include "migration/utils/mem_list.hpp"
#include "performance.hpp"
#include "samples/perf_event/perf_event.hpp"
#include "system_info/system_info.hpp"
#include "utils/types.hpp"
namespace performance {
const size_t CACHE_LINE_SIZE = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
......@@ -43,9 +50,9 @@ namespace performance {
};
private:
std::vector<ins_t> flops_; // Number of Floating Point operations executed in each CPU.
std::vector<ins_t> inst_; // Number of instructions executed in each CPU.
std::vector<req_t> total_reqs_; // Total number of memory requests in each CPU.
std::vector<ins_t> flops_; // Number of Floating Point operations executed in each node.
std::vector<ins_t> inst_; // Number of instructions executed in each node.
std::vector<req_t> total_reqs_; // Total number of memory requests in each node.
std::vector<tim_t> times_; // Time (in nanoseconds) consumed by the instructions and requests.
std::vector<req_t> node_reqs_; // Number of memory requests to each memory node.
......@@ -55,8 +62,8 @@ namespace performance {
public:
rm3d() :
flops_(system_info::num_of_cpus(), 0), inst_(system_info::num_of_cpus(), 0),
total_reqs_(system_info::num_of_cpus(), 0), times_(system_info::num_of_cpus(), 0),
flops_(system_info::num_of_memories(), 0), inst_(system_info::num_of_memories(), 0),
total_reqs_(system_info::num_of_memories(), 0), times_(system_info::num_of_memories(), 0),
node_reqs_(system_info::num_of_memories(), 0), mean_lat_(system_info::num_of_memories(), 0),
perfs_(system_info::num_of_memories(), PERFORMANCE_INVALID_VALUE) {
}
......@@ -80,20 +87,20 @@ namespace performance {
}
inline void add_data(const inst_data_cell_t & data) {
const auto cpu = data.cpu_;
const auto node = system_info::node_from_cpu(data.cpu_);
if (data.flop_) {
flops_[cpu] += data.inst_ * data.multiplier_;
flops_[node] += data.inst_ * data.multiplier_;
} else {
inst_[cpu] += data.inst_ * data.multiplier_;
inst_[node] += data.inst_ * data.multiplier_;
}
times_[cpu] += data.time_;
times_[node] += data.time_;
}
inline void add_data(const reqs_data_cell_t & data) {
const auto cpu = data.cpu_;
const auto node = system_info::node_from_cpu(data.cpu_);
total_reqs_[cpu] += data.reqs_;
total_reqs_[node] += data.reqs_;
}
inline void add_data(const memory_data_cell_t & data) {
......@@ -108,24 +115,22 @@ namespace performance {
node_reqs_[node] += data.reqs_;
}
std::tuple<double, double, double> calc_perf(const cpu_t cpu) {
const auto node = system_info::node_from_cpu(cpu);
if (times_[cpu] == 0) { // No data
std::tuple<double, double, double> calc_perf(const node_t node) {
if (times_[node] == 0) { // No data
return {};
}
// If no memory samples, take minimum_latency
const auto mean_lat =
(total_reqs_.at(cpu) == 0 || mean_lat_[node] <= 0) ? samples::minimum_latency : mean_lat_[node];
(total_reqs_.at(node) == 0 || mean_lat_[node] <= 0) ? samples::minimum_latency : mean_lat_[node];
const auto scaled_reqs = scaled_node_reqs(node);
const double seconds = times_[cpu] / 1e9; // 10^9 as times are measured in nanoseconds
const double seconds = times_[node] / 1e9; // 10^9 as times are measured in nanoseconds
const double ops_per_s = static_cast<double>(inst_[cpu] + flops_[cpu]) / seconds;
const double ops_per_b = static_cast<double>(inst_[cpu] + flops_[cpu]) /
static_cast<double>(total_reqs_.at(cpu) * CACHE_LINE_SIZE);
const double ops_per_s = static_cast<double>(inst_[node] + flops_[node]) / seconds;
const double ops_per_b = static_cast<double>(inst_[node] + flops_[node]) /
static_cast<double>(total_reqs_.at(node) * CACHE_LINE_SIZE);
// Compute first the division in order to reduce chances of overflow
// as ops_per_s is expected to be huge -> ops_per_s * ops_per_b to be potentially HUGE
......@@ -135,24 +140,14 @@ namespace performance {
perfs_.at(node) = std::pow(ops_per_s, BETA) * (std::pow(ops_per_b, GAMMA) / std::pow(mean_lat, ALPHA));
}
// std::clog << inst_[cpu] << ';' << flops_[cpu] << ';' << times_[cpu] << ';' << seconds << ';'
// << total_reqs_[cpu];
// for (const auto reqs : node_reqs_) {
// std::clog << ';' << reqs;
// }
// for (const auto reqs : scaled_reqs) {
// std::clog << ';' << reqs;
// }
// std::clog << ';' << ops_per_s << ';' << ops_per_b << ';' << mean_lat_[node] << ';' << perfs_[node] << '\n';
return std::make_tuple(ops_per_s, ops_per_b, mean_lat);
}
std::tuple<double, double, double> calc_perf() {
std::tuple<double, double, double> total_data;
for (size_t cpu = 0; cpu < system_info::num_of_cpus(); cpu++) {
const auto data = calc_perf(cpu);
for (size_t node = 0; node < system_info::num_of_memories(); node++) {
const auto data = calc_perf(node);
std::get<OPS_PER_SEC>(total_data) += std::get<OPS_PER_SEC>(data);
std::get<OPS_PER_BYTE>(total_data) += std::get<OPS_PER_BYTE>(data);
std::get<MEAN_LATENCY>(total_data) += std::get<MEAN_LATENCY>(data);
......@@ -200,8 +195,8 @@ namespace performance {
return std::move(scaled_reqs);
}
[[nodiscard]] inline std::vector<req_t> scaled_node_reqs(const cpu_t cpu) const {
const auto scale = total_reqs_.at(cpu) / std::accumulate(node_reqs_.begin(), node_reqs_.end(), 0.0);
[[nodiscard]] inline std::vector<req_t> scaled_node_reqs(const node_t node) const {
const auto scale = total_reqs_.at(node) / std::accumulate(node_reqs_.begin(), node_reqs_.end(), 0.0);
auto scaled_reqs = node_reqs_;
......@@ -212,10 +207,22 @@ namespace performance {
return std::move(scaled_reqs);
}
[[nodiscard]] inline auto ops_per_s(const cpu_t cpu) const {
const double seconds = times_[cpu] / 1e9; // 10^9 as times are measured in nanoseconds
// Operations (instructions + floating point operations) per second recorded for `node`.
[[nodiscard]] inline auto ops_per_s(const node_t node) const {
    // times_ is stored in nanoseconds: divide by 10^9 to get seconds
    const double elapsed_s = times_[node] / 1e9;
    const auto   total_ops = static_cast<double>(inst_[node] + flops_[node]);

    return total_ops / elapsed_s;
}
// Operations (instructions + floating point operations) per byte for `node`.
// The byte count is total_reqs_ * CACHE_LINE_SIZE, i.e. one cache line per memory request.
// NOTE(review): there is no guard for nodes with zero requests — the division is done as is.
[[nodiscard]] inline auto ops_per_byte(const node_t node) const {
    return static_cast<double>(inst_[node] + flops_[node]) /
           static_cast<double>(total_reqs_.at(node) * CACHE_LINE_SIZE);
}
return static_cast<double>(inst_[cpu] + flops_[cpu]) / seconds;
// Latency to use for `node`: its mean latency, or samples::minimum_latency
// when the node has no requests or no positive mean latency recorded.
[[nodiscard]] inline auto av_latency(const node_t node) const {
    const bool no_memory_samples = total_reqs_.at(node) == 0 || mean_lat_[node] <= 0;

    return no_memory_samples ? samples::minimum_latency : mean_lat_[node];
}
inline void reset() {
......@@ -242,20 +249,18 @@ namespace performance {
}
friend std::ostream & operator<<(std::ostream & os, const rm3d & d) {
for (size_t cpu = 0; cpu < system_info::num_of_cpus(); cpu++) {
os << "\tCPU: " << cpu << ", "
<< "FLOPS = " << d.flops_[cpu] << ", "
<< "INSTS = " << d.inst_[cpu] << ", "
<< "TOTAL_REQS = " << d.total_reqs_[cpu] << ", "
<< "TIMES = " << d.times_[cpu] << '\n';
}
os.precision(2);
os << std::fixed;
for (size_t node = 0; node < system_info::num_of_memories(); node++) {
os << "\tNODE: " << node << ", "
<< "FLOPS = " << d.flops_[node] << ", "
<< "INSTS = " << d.inst_[node] << ", "
<< "TOTAL_REQS = " << d.total_reqs_[node] << ", "
<< "TIMES = " << d.times_[node] << '\n';
os.precision(2);
os << std::fixed;
os << "\tNODE: " << node << ", PERF = " << d.perfs_[node] << '\n';
os << std::defaultfloat;
}
os << std::defaultfloat;
return os;
}
};
......
......@@ -20,11 +20,6 @@
#include "rm3d.hpp"
#include "system_info/memory_info.hpp"
// double OPS_PER_S = 0;
// double OPS_PER_B = 0;
// double MEAN_LATEN = 0;
// size_t NUM_MEASUR = 0;
namespace performance {
namespace details {
class row {
......@@ -69,17 +64,24 @@ namespace performance {
return performance_.scaled_node_reqs();
}
[[nodiscard]] inline auto ops_per_s(const cpu_t cpu) const {
return performance_.ops_per_s(cpu);
// Operations per second of this row for `node`; forwards to performance_.ops_per_s().
[[nodiscard]] inline auto ops_per_s(const node_t node) const {
return performance_.ops_per_s(node);
}
// Operations per byte of this row for `node`; forwards to performance_.ops_per_byte().
[[nodiscard]] inline auto ops_per_byte(const node_t node) const {
return performance_.ops_per_byte(node);
}
// Latency value of this row for `node`; forwards to performance_.av_latency().
[[nodiscard]] inline auto av_latency(const node_t node) const {
return performance_.av_latency(node);
}
friend std::ostream & operator<<(std::ostream & os, const row & r) {
os << "Running: " << r.running_ << '\n';
os.precision(2);
os << std::fixed;
for (size_t cpu = 0; cpu < system_info::num_of_cpus(); cpu++) {
os << '\t' << "Mean latency CPU " << cpu << ": "
<< r.performance_.mean_latency(system_info::node_from_cpu(cpu)) << '\n';
for (size_t node = 0; node < system_info::num_of_memories(); node++) {
os << '\t' << "Mean latency NODE " << node << ": " << r.performance_.mean_latency(node) << '\n';
}
os << "\tPerformance:\n" << r.performance_;
os << std::defaultfloat;
......@@ -264,20 +266,6 @@ namespace performance {
mean_perf_pid_[pid] += perf;
mean_percent_cpu_pid_[pid] += percent_cpu;
/* */
// const auto new_ops_per_s = std::get<rm3d::OPS_PER_SEC>(perf_data);
// const auto new_ops_per_b = std::get<rm3d::OPS_PER_BYTE>(perf_data);
// const auto new_mean_laten = std::get<rm3d::MEAN_LATENCY>(perf_data);
// if (std::isnormal(new_ops_per_s) && std::isnormal(new_ops_per_b) && std::isnormal(new_mean_laten)) {
// OPS_PER_S = (OPS_PER_S * NUM_MEASUR + std::get<rm3d::OPS_PER_SEC>(perf_data)) / (NUM_MEASUR + 1);
// OPS_PER_B = (OPS_PER_B * NUM_MEASUR + std::get<rm3d::OPS_PER_BYTE>(perf_data)) / (NUM_MEASUR + 1);
// MEAN_LATEN = (MEAN_LATEN * NUM_MEASUR + std::get<rm3d::MEAN_LATENCY>(perf_data)) / (NUM_MEASUR + 1);
// NUM_MEASUR++;
// }
// std::clog << pid << ";" << tid << ";" << perf << ';' << new_ops_per_s << ';' << new_ops_per_b << ';'
// << new_mean_laten << '\n';
/* */
valid_perf++;
valid_perf_pid[pid]++;
}
......@@ -285,8 +273,6 @@ namespace performance {
mean_perf_ = temp_mean / valid_perf;
mean_perf_ = std::isnormal(mean_perf_) ? mean_perf_ : 1.0;
// std::clog << "Mean;" << mean_perf_ << ';' << OPS_PER_S << ';' << OPS_PER_B << ';' << MEAN_LATEN << '\n';