From 1af3f3e52c7977e7ae9aecc3eed1107aea9416b4 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 12 Apr 2021 13:09:13 +0300 Subject: [PATCH 01/26] Added base for minimal variance sampling booster --- include/LightGBM/config.h | 8 +++ src/boosting/boosting.cpp | 3 + src/boosting/mvs.cpp | 128 ++++++++++++++++++++++++++++++++++++++ src/boosting/mvs.hpp | 99 +++++++++++++++++++++++++++++ 4 files changed, 238 insertions(+) create mode 100644 src/boosting/mvs.cpp create mode 100644 src/boosting/mvs.hpp diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 73696cdb88f8..886195b81a92 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -317,6 +317,14 @@ struct Config { // desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored double neg_bagging_fraction = 1.0; + // check = >0.0 + // check = <=1.0 + // desc = used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored + double mvs_lambda = 1e-2; + + // desc = use adaptive variant of mvs boosting + bool mvs_adaptive = false; + // alias = subsample_freq // desc = frequency for bagging // desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index 91fa318a0f18..efadfe888070 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -8,6 +8,7 @@ #include "gbdt.h" #include "goss.hpp" #include "rf.hpp" +#include "mvs.hpp" namespace LightGBM { @@ -42,6 +43,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename return new GOSS(); } else if (type == std::string("rf")) { return new RF(); + } else if (type == std::string("mvs")) { + return new MVS(); } else { return nullptr; } diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp new file mode 100644 index 000000000000..d6ae84923e31 --- /dev/null +++ b/src/boosting/mvs.cpp @@ -0,0 +1,128 @@ +// +// Created by archer on 11.04.2021. +// + +#include "mvs.hpp" + +namespace LightGBM { + +static score_t CalculateThreshold(std::vector grad_values_copy, double sample_size, data_size_t* big_grad_cnt) { + std::vector *grad_values = &grad_values_copy; + double sum_low = 0.; + size_t n_high = 0; + int begin = 0; + int end = static_cast(grad_values->size()); + + while (begin != end) { + // TODO do partition in parallel + // TODO partition to three parts + int middle_begin, middle_end; + ArrayArgs::Partition(grad_values, begin, end, &middle_begin, &middle_end); + + const size_t n_middle = middle_end - middle_begin; + const size_t n_right = end - middle_end; + const score_t pivot = (*grad_values)[middle_begin]; + + // TODO do sum in parallel + double cur_left_sum = std::accumulate(&grad_values->at(begin), &grad_values->at(middle_begin), 0.0); + double sum_middle = n_middle * pivot; + + double cur_sampling_rate = (sum_low + cur_left_sum) / pivot + n_right + n_middle + n_high; + + if (cur_sampling_rate > sample_size) { + sum_low += sum_middle + cur_left_sum; + begin = middle_end; + } else { + n_high += n_right + n_middle; + end = middle_begin; + } + } + *big_grad_cnt = n_high; + return sum_low / (sample_size - n_high + MVS::kMVSEps); +} + +static double ComputeLeavesMeanSquaredValue(const Tree &tree) { + // TODO sum over leaves are leave values one dimensional + // TODO sum using openmp + double sum_values = 0.0; + for (int i = 0; i < tree.num_leaves(); ++i) { + const auto output = tree.LeafOutput(i); + sum_values += output * output; + } + return std::sqrt(sum_values / tree.num_leaves()); +} + +void MVS::ResetMVS() { + CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0); + CHECK(config_->mvs_lambda > 0.0f && config_->mvs_lambda < 1.0f); + CHECK(!balanced_bagging_); + const auto sample_size = static_cast(config_->bagging_fraction * num_data_); + CHECK_EQ(sample_size, bag_data_indices_.size()); + Log::Info("Using MVS"); + +} + +double MVS::GetLambda() { + double lambda = ComputeLeavesMeanSquaredValue(*models_.back()); + return lambda; +} + +void MVS::Bagging(int iter) { + bag_data_cnt_ = num_data_; + if (mvs_adaptive_) { + mvs_lambda_ = GetLambda(); + } + + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t *left, + data_size_t *) { + data_size_t cur_left_cout = BaggingHelper(cur_start, cur_cnt, left); + return cur_left_cout; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + if (!is_use_subset_) { + tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + } else { + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + } +} + +data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) { + if (cnt <= 0) { + return 0; + } + + std::vector tmp_derivatives(cnt, 0.0f); + for (data_size_t i = 0; i < cnt; ++i) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + start * i; + tmp_derivatives[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; + } + tmp_derivatives[i] = std::sqrt(tmp_derivatives[i]); + } + + auto sample_rate = static_cast(cnt * config_->bagging_fraction); + data_size_t big_grad_cnt = 0; + const auto threshold = CalculateThreshold(tmp_derivatives, static_cast(sample_rate), &big_grad_cnt); + data_size_t left_cnt = 0; + data_size_t big_weight_cnt = 0; + for (data_size_t i = 0; i < cnt; ++i) { + auto position = start + i; + if (tmp_derivatives[i] > threshold) { + buffer[left_cnt++] = position; + ++big_weight_cnt; + } else { + double proba_threshold = tmp_derivatives[i] / threshold; + data_size_t sampled = left_cnt - big_weight_cnt; + data_size_t rest_needed = ; + } + } +} + +} // namspace LightGBM \ No newline at end of file diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp new file mode 100644 index 000000000000..781f01fcd36b --- /dev/null +++ b/src/boosting/mvs.hpp @@ -0,0 +1,99 @@ +/*! + * Copyright (c) 2017 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_BOOSTING_MVSB_H_ +#define LIGHTGBM_BOOSTING_MVSB_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gbdt.h" +#include "score_updater.hpp" + +namespace LightGBM { + +class MVS : public GBDT { + public: + /*! + * \brief Constructor + */ + MVS() : GBDT() { + } + + ~MVS() override = default; + + void Init(const Config *config, const Dataset *train_data, const ObjectiveFunction *objective_function, + const std::vector &training_metrics) override { + GBDT::Init(config, train_data, objective_function, training_metrics); + mvs_lambda_ = config_->mvs_lambda; + mvs_adaptive_ = config_->mvs_adaptive; + ResetMVS(); + if (objective_function_ == nullptr) { + // use customized objective function + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size, 0.0f); + hessians_.resize(total_size, 0.0f); + } + } + + void ResetTrainingData(const Dataset *train_data, const ObjectiveFunction *objective_function, + const std::vector &training_metrics) override { + GBDT::ResetTrainingData(train_data, objective_function, training_metrics); + ResetMVS(); + } + + void ResetConfig(const Config *config) override { + GBDT::ResetConfig(config); + mvs_lambda_ = config_->mvs_lambda; + mvs_adaptive_ = config_->mvs_adaptive; + ResetMVS(); + } + + void ResetMVS(); + + bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { + if (gradients != nullptr) { + // use customized objective function + CHECK(hessians != nullptr && objective_function_ == nullptr); + int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + #pragma omp parallel for schedule(static) + for (int64_t i = 0; i < total_size; ++i) { + gradients_[i] = gradients[i]; + hessians_[i] = hessians[i]; + } + return GBDT::TrainOneIter(gradients_.data(), hessians_.data()); + } else { + CHECK(hessians == nullptr); + return GBDT::TrainOneIter(nullptr, nullptr); + } + } + + data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) override; + + void Bagging(int iter) override; + // TODO move this constant to some constants + static constexpr double kMVSEps = 1e-20; + + protected: + + bool GetIsConstHessian(const ObjectiveFunction *) override { + return false; + } + + double GetLambda(); + + double mvs_lambda_; + bool mvs_adaptive_; +}; +} // namespace LightGBM +#endif // LIGHTGBM_BOOSTING_MVS_H_ \ No newline at end of file From 0ad27406adf1b5a1be71e323d6ba65497721e1d7 Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 06:19:57 +0300 Subject: [PATCH 02/26] Implemented MVS booster with support for multioutput targets, deterministic execution on small datasets/ --- include/LightGBM/config.h | 2 +- src/boosting/boosting.cpp | 2 + src/boosting/mvs.cpp | 195 ++++++++++++++++++++++++++------------ src/boosting/mvs.hpp | 11 ++- src/io/config.cpp | 4 +- src/io/config_auto.cpp | 7 ++ 6 files changed, 156 insertions(+), 65 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 886195b81a92..822e8fdccd6b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -320,7 +320,7 @@ struct Config { // check = >0.0 // check = <=1.0 // desc = used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored - double mvs_lambda = 1e-2; + double mvs_lambda = 1e-4; // desc = use adaptive variant of mvs boosting bool mvs_adaptive = false; diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index efadfe888070..23627e6b6e9c 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -59,6 +59,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename ret.reset(new GOSS()); } else if (type == std::string("rf")) { return new RF(); + } else if (type == std::string("mvs")) { + return new MVS(); } else { Log::Fatal("Unknown boosting type %s", type.c_str()); } diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index d6ae84923e31..9670c1b9d80c 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -4,83 +4,113 @@ #include "mvs.hpp" +#include + namespace LightGBM { -static score_t CalculateThreshold(std::vector grad_values_copy, double sample_size, data_size_t* big_grad_cnt) { - std::vector *grad_values = &grad_values_copy; - double sum_low = 0.; - size_t n_high = 0; - int begin = 0; - int end = static_cast(grad_values->size()); +using ConstTreeIterator = std::vector>::const_iterator; - while (begin != end) { - // TODO do partition in parallel - // TODO partition to three parts - int middle_begin, middle_end; - ArrayArgs::Partition(grad_values, begin, end, &middle_begin, &middle_end); +static double CalculateThresholdSequential(std::vector* gradients, data_size_t begin, data_size_t end, + const double sample_size) { + double current_sum_small = 0.0; + data_size_t big_grad_size = 0; - const size_t n_middle = middle_end - middle_begin; - const size_t n_right = end - middle_end; - const score_t pivot = (*grad_values)[middle_begin]; + while (begin != end) { + data_size_t middle_begin=0, middle_end=0; + ArrayArgs::Partition(gradients, begin, end, &middle_begin, &middle_end); + ++middle_begin; // for half intervals + const data_size_t n_middle = middle_end - middle_begin; + const data_size_t large_size = middle_begin - begin; - // TODO do sum in parallel - double cur_left_sum = std::accumulate(&grad_values->at(begin), &grad_values->at(middle_begin), 0.0); - double sum_middle = n_middle * pivot; + const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0); + const double sum_middle = (*gradients)[middle_begin] * n_middle; - double cur_sampling_rate = (sum_low + cur_left_sum) / pivot + n_right + n_middle + n_high; + const double + current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size; - if (cur_sampling_rate > sample_size) { - sum_low += sum_middle + cur_left_sum; - begin = middle_end; - } else { - n_high += n_right + n_middle; + if (current_sampling_rate > sample_size) { + current_sum_small += sum_small + sum_middle; end = middle_begin; + } else { + big_grad_size += n_middle + large_size; + begin = middle_end; } } - *big_grad_cnt = n_high; - return sum_low / (sample_size - n_high + MVS::kMVSEps); + + return current_sum_small / (sample_size - big_grad_size + kEpsilon); } -static double ComputeLeavesMeanSquaredValue(const Tree &tree) { - // TODO sum over leaves are leave values one dimensional - // TODO sum using openmp +static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, ConstTreeIterator end) { double sum_values = 0.0; - for (int i = 0; i < tree.num_leaves(); ++i) { - const auto output = tree.LeafOutput(i); - sum_values += output * output; + data_size_t num_leaves = (*begin)->num_leaves(); +#pragma omp parallel for schedule(static, 2048) reduction(+:sum_values) + for (data_size_t leaf_idx = 0; leaf_idx < num_leaves; ++leaf_idx) { + double leave_value = 0.0; + for (ConstTreeIterator it = begin; it != end; ++it) { + if (leaf_idx < (**it).num_leaves()) { + const double value = (*it)->LeafOutput(leaf_idx); + leave_value += value * value; + } + } + sum_values += std::sqrt(leave_value); } - return std::sqrt(sum_values / tree.num_leaves()); + return sum_values / num_leaves; } -void MVS::ResetMVS() { - CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0); - CHECK(config_->mvs_lambda > 0.0f && config_->mvs_lambda < 1.0f); - CHECK(!balanced_bagging_); - const auto sample_size = static_cast(config_->bagging_fraction * num_data_); - CHECK_EQ(sample_size, bag_data_indices_.size()); - Log::Info("Using MVS"); - +static double ComputeMeanGradValues(score_t *gradients, + score_t *hessians, + data_size_t size, + data_size_t num_tree_per_iteration) { + double sum = 0.0; +#pragma omp parallel for schedule(static, 1024) reduction(+:sum) + for (data_size_t i = 0; i < size; ++i) { + double local_hessians = 0.0, local_gradients = 0.0; + for (data_size_t j = 0; j < num_tree_per_iteration; ++j) { + size_t idx = static_cast(size) * j + i; + local_hessians += hessians[idx] * hessians[idx]; + local_gradients += gradients[idx] * gradients[idx]; + } + sum += std::sqrt(local_gradients / local_hessians); + } + return sum / size; } double MVS::GetLambda() { - double lambda = ComputeLeavesMeanSquaredValue(*models_.back()); + if (!mvs_adaptive_) { + return mvs_lambda_; + } + double lambda = + (this->iter_ > 0) ? ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_, models_.cend()) + / config_->learning_rate + : ComputeMeanGradValues(gradients_.data(), + hessians_.data(), + num_data_, + num_tree_per_iteration_); + return lambda; } void MVS::Bagging(int iter) { + if (iter % config_->bagging_freq != 0 && !need_re_bagging_) { + return; + } + bag_data_cnt_ = num_data_; - if (mvs_adaptive_) { - mvs_lambda_ = GetLambda(); + mvs_lambda_ = GetLambda(); + + if (num_data_ <= kMaxSequentialSize) { + threshold_ = GetThreshold(0, num_data_); } auto left_cnt = bagging_runner_.Run( num_data_, [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t *left, data_size_t *) { - data_size_t cur_left_cout = BaggingHelper(cur_start, cur_cnt, left); - return cur_left_cout; + data_size_t left_count = BaggingHelper(cur_start, cur_cnt, left); + return left_count; }, bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; if (!is_use_subset_) { tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); @@ -91,6 +121,8 @@ void MVS::Bagging(int iter) { tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); } + threshold_ = 0.0; + Log::Debug("MVS Sample size %d %d", left_cnt, static_cast(config_->bagging_fraction * num_data_)); } data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) { @@ -98,31 +130,72 @@ data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t * return 0; } - std::vector tmp_derivatives(cnt, 0.0f); - for (data_size_t i = 0; i < cnt; ++i) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - size_t idx = static_cast(cur_tree_id) * num_data_ + start * i; - tmp_derivatives[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; - } - tmp_derivatives[i] = std::sqrt(tmp_derivatives[i]); - } + const double threshold = GetThreshold(start, cnt); - auto sample_rate = static_cast(cnt * config_->bagging_fraction); - data_size_t big_grad_cnt = 0; - const auto threshold = CalculateThreshold(tmp_derivatives, static_cast(sample_rate), &big_grad_cnt); data_size_t left_cnt = 0; + data_size_t right_pos = cnt; data_size_t big_weight_cnt = 0; for (data_size_t i = 0; i < cnt; ++i) { - auto position = start + i; - if (tmp_derivatives[i] > threshold) { + data_size_t position = start + i; + + double derivative = 0.0; + for (data_size_t j = 0; j < num_tree_per_iteration_; ++j) { + size_t idx = static_cast(j) * num_data_ + position; + derivative += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; + } + derivative = std::sqrt(derivative); + + if (derivative >= threshold) { buffer[left_cnt++] = position; ++big_weight_cnt; } else { - double proba_threshold = tmp_derivatives[i] / threshold; - data_size_t sampled = left_cnt - big_weight_cnt; - data_size_t rest_needed = ; + const double proba_threshold = derivative / threshold; + const double proba = bagging_rands_[position / bagging_rand_block_].NextFloat(); + if (proba < proba_threshold) { + buffer[left_cnt++] = position; + for (data_size_t tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) { + size_t idx = static_cast(num_data_) * tree_id + position; + gradients_[idx] /= proba_threshold; + hessians_[idx] /= proba_threshold; + } + } else { + buffer[--right_pos] = position; + } + } + } + + return left_cnt; +} + +double MVS::GetThreshold(data_size_t begin, data_size_t cnt) { + data_size_t n_blocks, block_size; + Threading::BlockInfoForceSize(num_data_, bagging_rand_block_, &n_blocks, &block_size); + if (num_data_ < kMaxSequentialSize && block_size > 1 && threshold_ != 0.0) { + return threshold_; + } + + for (data_size_t i = begin; i < begin + cnt; ++i) { + tmp_derivatives_[i] = 0.0f; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + i; + tmp_derivatives_[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; } + tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]); } + + double threshold = CalculateThresholdSequential(&tmp_derivatives_, begin, begin + cnt, + cnt * config_->bagging_fraction); + return threshold; +} + +void MVS::ResetMVS() { + CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0); + CHECK(config_->mvs_lambda >= 0.0f); + CHECK(!balanced_bagging_); + + bag_data_indices_.resize(num_data_); + tmp_derivatives_.resize(num_data_); + Log::Info("Using MVS"); } } // namspace LightGBM \ No newline at end of file diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index 781f01fcd36b..b08099547d25 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -61,12 +61,12 @@ class MVS : public GBDT { void ResetMVS(); - bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { + bool TrainOneIter(const score_t *gradients, const score_t *hessians) override { if (gradients != nullptr) { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < total_size; ++i) { gradients_[i] = gradients[i]; hessians_[i] = hessians[i]; @@ -80,6 +80,7 @@ class MVS : public GBDT { data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) override; + void Bagging(int iter) override; // TODO move this constant to some constants static constexpr double kMVSEps = 1e-20; @@ -90,9 +91,15 @@ class MVS : public GBDT { return false; } + double GetThreshold(data_size_t begin, data_size_t end); + double GetLambda(); + static const data_size_t kMaxSequentialSize = 256000; + double mvs_lambda_; + double threshold_{0.0}; + std::vector tmp_derivatives_; bool mvs_adaptive_; }; } // namespace LightGBM diff --git a/src/io/config.cpp b/src/io/config.cpp index fbb9e339933f..93e2bd511761 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -58,7 +58,9 @@ void GetBoostingType(const std::unordered_map& params, *boosting = "goss"; } else if (value == std::string("rf") || value == std::string("random_forest")) { *boosting = "rf"; - } else { + } else if (value == std::string("mvs")) { + *boosting = "mvs"; + }else { Log::Fatal("Unknown boosting type %s", value.c_str()); } } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 06c53e84268a..6716138af439 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -191,6 +191,8 @@ const std::unordered_set& Config::parameter_set() { "max_depth", "min_data_in_leaf", "min_sum_hessian_in_leaf", + "mvs_lambda", + "mvs_adaptive", "bagging_fraction", "pos_bagging_fraction", "neg_bagging_fraction", @@ -345,6 +347,9 @@ void Config::GetMembersFromString(const std::unordered_map Date: Sun, 9 May 2021 07:54:49 +0300 Subject: [PATCH 03/26] Updated documentation and fixed some linting errors --- docs/Parameters.rst | 14 ++++++++++++++ include/LightGBM/config.h | 3 +++ src/boosting/mvs.cpp | 15 ++++++++------- src/boosting/mvs.hpp | 5 ++--- src/io/config.cpp | 2 +- src/io/config_auto.cpp | 17 ++++++++++------- 6 files changed, 38 insertions(+), 18 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index db4673b8dcff..3b5af283feaf 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -119,6 +119,8 @@ Core Parameters - **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations + - ``mvs``, Minimal variance sampling __ + - ``linear_tree`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``linear_trees`` - fit piecewise linear gradient boosting tree @@ -336,6 +338,18 @@ Learning Control Parameters - **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored +- ``mvs_lambda`` :raw-html:`🔗︎`, default = ``1e-4``, type = double, constraints: ``0.0 < mvs_lambda <= 1.0`` + + - used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored + + - used only in ``mvs`` + +- ``mvs_adaptive`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - use adaptive variant of mvs boosting + + - used only in ``mvs`` + - ``bagging_freq`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``subsample_freq`` - frequency for bagging diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2513191ddfab..be63d8ae5254 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -147,6 +147,7 @@ struct Config { // desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees `__ // desc = ``goss``, Gradient-based One-Side Sampling // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations + // desc = ``mvs``, Minimal variance sampling __ std::string boosting = "gbdt"; // alias = linear_trees @@ -321,9 +322,11 @@ struct Config { // check = >0.0 // check = <=1.0 // desc = used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored + // desc = used only in ``mvs`` double mvs_lambda = 1e-4; // desc = use adaptive variant of mvs boosting + // desc = used only in ``mvs`` bool mvs_adaptive = false; // alias = subsample_freq diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index 9670c1b9d80c..ebd464b78d9c 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -1,10 +1,11 @@ -// -// Created by archer on 11.04.2021. -// +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ #include "mvs.hpp" -#include +#include namespace LightGBM { @@ -16,9 +17,9 @@ static double CalculateThresholdSequential(std::vector* gradients, data data_size_t big_grad_size = 0; while (begin != end) { - data_size_t middle_begin=0, middle_end=0; + data_size_t middle_begin = 0, middle_end = 0; ArrayArgs::Partition(gradients, begin, end, &middle_begin, &middle_end); - ++middle_begin; // for half intervals + ++middle_begin; // for half intervals const data_size_t n_middle = middle_end - middle_begin; const data_size_t large_size = middle_begin - begin; @@ -198,4 +199,4 @@ void MVS::ResetMVS() { Log::Info("Using MVS"); } -} // namspace LightGBM \ No newline at end of file +} // namespace LightGBM diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index b08099547d25..2cfee917ac53 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -82,11 +82,10 @@ class MVS : public GBDT { void Bagging(int iter) override; - // TODO move this constant to some constants + static constexpr double kMVSEps = 1e-20; protected: - bool GetIsConstHessian(const ObjectiveFunction *) override { return false; } @@ -103,4 +102,4 @@ class MVS : public GBDT { bool mvs_adaptive_; }; } // namespace LightGBM -#endif // LIGHTGBM_BOOSTING_MVS_H_ \ No newline at end of file +#endif // LIGHTGBM_BOOSTING_MVS_H_ diff --git a/src/io/config.cpp b/src/io/config.cpp index 65eaa161cfb0..2a47a1226601 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -60,7 +60,7 @@ void GetBoostingType(const std::unordered_map& params, *boosting = "rf"; } else if (value == std::string("mvs")) { *boosting = "mvs"; - }else { + } else { Log::Fatal("Unknown boosting type %s", value.c_str()); } } diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 9293c5220877..41a8d56e55b5 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -193,11 +193,11 @@ const std::unordered_set& Config::parameter_set() { "max_depth", "min_data_in_leaf", "min_sum_hessian_in_leaf", - "mvs_lambda", - "mvs_adaptive", "bagging_fraction", "pos_bagging_fraction", "neg_bagging_fraction", + "mvs_lambda", + "mvs_adaptive", "bagging_freq", "bagging_seed", "feature_fraction", @@ -350,9 +350,6 @@ void Config::GetMembersFromString(const std::unordered_map Date: Sun, 9 May 2021 08:44:25 +0300 Subject: [PATCH 04/26] fixed python sklearn documentation, tryed to fix R Cran CI --- python-package/lightgbm/sklearn.py | 1 + src/boosting/mvs.cpp | 2 ++ src/boosting/mvs.hpp | 10 +++++----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2b2261736067..6cf7b5cacab2 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -364,6 +364,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, 'dart', Dropouts meet Multiple Additive Regression Trees. 'goss', Gradient-based One-Side Sampling. 'rf', Random Forest. + 'mvs', Minimal Variance Sampling. num_leaves : int, optional (default=31) Maximum tree leaves for base learners. max_depth : int, optional (default=-1) diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index ebd464b78d9c..e7dc108feab3 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -11,6 +11,8 @@ namespace LightGBM { using ConstTreeIterator = std::vector>::const_iterator; +MVS::MVS() : GBDT() {} + static double CalculateThresholdSequential(std::vector* gradients, data_size_t begin, data_size_t end, const double sample_size) { double current_sum_small = 0.0; diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index 2cfee917ac53..9c584bdd6e2c 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -2,8 +2,8 @@ * Copyright (c) 2017 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LIGHTGBM_BOOSTING_MVSB_H_ -#define LIGHTGBM_BOOSTING_MVSB_H_ +#ifndef LIGHTGBM_BOOSTING_MVS_H_ +#define LIGHTGBM_BOOSTING_MVS_H_ #include #include @@ -27,10 +27,10 @@ class MVS : public GBDT { /*! * \brief Constructor */ - MVS() : GBDT() { - } + MVS(); - ~MVS() override = default; + ~MVS() { + } void Init(const Config *config, const Dataset *train_data, const ObjectiveFunction *objective_function, const std::vector &training_metrics) override { From 0f2620eea2588c1bc799dbf5db2cc06ab6c3730d Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 09:05:32 +0300 Subject: [PATCH 05/26] Second attempt to fix R pipeline --- R-package/R/lgb.train.R | 2 +- R-package/src/Makevars.in | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index c47d6ce6901e..c9724953ce7d 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -15,7 +15,7 @@ #' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"mvs"} or \code{"goss"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfitting. Tree still grow by leaf-wise.} diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 2490ba0757df..7b0ef5852a2e 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -26,6 +26,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/mvs.o\ io/bin.o \ io/config.o \ io/config_auto.o \ From d50769e0201aa3a28a8b10d35b8f2d4792f05a50 Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 09:34:53 +0300 Subject: [PATCH 06/26] Fixed R package build for windows and linting error --- R-package/R/lgb.train.R | 3 ++- R-package/src/Makevars.win.in | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index c9724953ce7d..cba50e38939d 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -15,7 +15,8 @@ #' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"mvs"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, +#' \code{"mvs"} or \code{"goss"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfitting. Tree still grow by leaf-wise.} diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 0fb2de926905..3ec46956b7c4 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,6 +27,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/mvs.o\ io/bin.o \ io/config.o \ io/config_auto.o \ From f531f3a37673f1eacdaf8695ca6816cf03811709 Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 10:20:42 +0300 Subject: [PATCH 07/26] Revert "Fixed R package build for windows and linting error" This reverts commit d50769e0 --- R-package/R/lgb.train.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index cba50e38939d..c9724953ce7d 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -15,8 +15,7 @@ #' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, -#' \code{"mvs"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"mvs"} or \code{"goss"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfitting. Tree still grow by leaf-wise.} From ef1a28c1f43897cd8016cfc129fe4e7bca874db6 Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 10:25:42 +0300 Subject: [PATCH 08/26] Revert "Revert "Fixed R package build for windows and linting error"" This reverts commit f531f3a37673f1eacdaf8695ca6816cf03811709. --- R-package/R/lgb.train.R | 2 +- R-package/src/Makevars.win.in | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index c9724953ce7d..c47d6ce6901e 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -15,7 +15,7 @@ #' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"mvs"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfitting. Tree still grow by leaf-wise.} diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 3ec46956b7c4..0fb2de926905 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,7 +27,6 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ - boosting/mvs.o\ io/bin.o \ io/config.o \ io/config_auto.o \ From c6100356140a74e2fc47f602e3eeccc801c3a829 Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 10:28:14 +0300 Subject: [PATCH 09/26] Fixed some documentation --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 3 ++- src/boosting/mvs.hpp | 4 +--- src/io/config_auto.cpp | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 3b5af283feaf..43ecdd83d97b 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -338,7 +338,7 @@ Learning Control Parameters - **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored -- ``mvs_lambda`` :raw-html:`🔗︎`, default = ``1e-4``, type = double, constraints: ``0.0 < mvs_lambda <= 1.0`` +- ``mvs_lambda`` :raw-html:`🔗︎`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0`` - used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index be63d8ae5254..19479c92fe88 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -319,12 +319,13 @@ struct Config { // desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored double neg_bagging_fraction = 1.0; + // default = 1e-4 // check = >0.0 - // check = <=1.0 // desc = used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored // desc = used only in ``mvs`` double mvs_lambda = 1e-4; + // default = false // desc = use adaptive variant of mvs boosting // desc = used only in ``mvs`` bool mvs_adaptive = false; diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index 9c584bdd6e2c..3e133034f5ae 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -24,9 +24,7 @@ namespace LightGBM { class MVS : public GBDT { public: - /*! - * \brief Constructor - */ + MVS(); ~MVS() { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 41a8d56e55b5..f4fcabdf522d 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -361,7 +361,6 @@ void Config::GetMembersFromString(const std::unordered_map Date: Sun, 9 May 2021 15:14:22 +0300 Subject: [PATCH 10/26] Fixed intendation error in mvs.hpp, fixed some windows build issues, added spinx version upper bound --- docs/requirements_base.txt | 2 +- src/boosting/mvs.cpp | 2 ++ src/boosting/mvs.hpp | 1 - windows/LightGBM.vcxproj | 2 ++ windows/LightGBM.vcxproj.filters | 6 ++++++ 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/requirements_base.txt b/docs/requirements_base.txt index baebc41b5e1c..7fa8e2b3fb9f 100644 --- a/docs/requirements_base.txt +++ b/docs/requirements_base.txt @@ -1,2 +1,2 @@ -sphinx +sphinx <= 3.5.4 sphinx_rtd_theme >= 0.5 diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index e7dc108feab3..fbbf9785fd7f 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -6,6 +6,8 @@ #include "mvs.hpp" #include +#include + namespace LightGBM { diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index 3e133034f5ae..d3e60483fe45 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -24,7 +24,6 @@ namespace LightGBM { class MVS : public GBDT { public: - MVS(); ~MVS() { diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 59b589a40d51..71a521cc593f 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -274,6 +274,7 @@ + @@ -310,6 +311,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 0f48c7564580..9df3e1ebb628 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -42,6 +42,9 @@ src\boosting + + src\boosting + src\network @@ -248,6 +251,9 @@ src\boosting + + src\boosting + src\io From a5b72f8d604199cf1bb985d3de38f1198c8847de Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 15:14:30 +0300 Subject: [PATCH 11/26] Fixed intendation error in mvs.hpp, fixed some windows build issues, added spinx version upper bound --- R-package/src/Makevars.win.in | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 0fb2de926905..3ec46956b7c4 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,6 +27,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/mvs.o\ io/bin.o \ io/config.o \ io/config_auto.o \ From fb8ff6e4341e1e49b75a88447b73bca6891c05af Mon Sep 17 00:00:00 2001 From: kruda Date: Sun, 9 May 2021 17:52:23 +0300 Subject: [PATCH 12/26] Update requirements_base.txt --- docs/requirements_base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements_base.txt b/docs/requirements_base.txt index bb6dbc8e61e1..9314ee0deb35 100644 --- a/docs/requirements_base.txt +++ b/docs/requirements_base.txt @@ -1,2 +1,2 @@ -sphinx <= 4 +sphinx < 4 sphinx_rtd_theme >= 0.5 From d499d158f8267cebf7b2557380d8bb3bc0b19b20 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 16:28:32 +0300 Subject: [PATCH 13/26] Update R-package/src/Makevars.in Co-authored-by: James Lamb --- R-package/src/Makevars.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 7b0ef5852a2e..4cde0fff7c8e 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -26,7 +26,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ - boosting/mvs.o\ + boosting/mvs.o \ io/bin.o \ io/config.o \ io/config_auto.o \ From 8a01fb806c70ec018788133d6042c5e2b53309b0 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 16:28:37 +0300 Subject: [PATCH 14/26] Update R-package/src/Makevars.win.in Co-authored-by: James Lamb --- R-package/src/Makevars.win.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 3ec46956b7c4..8e75bd58a309 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,7 +27,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ - boosting/mvs.o\ + boosting/mvs.o \ io/bin.o \ io/config.o \ io/config_auto.o \ From 4b630a1f259967392522ff900d576cade4984942 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 16:29:25 +0300 Subject: [PATCH 15/26] Added MVS booster support for dask tests --- tests/python_package_test/test_dask.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 2d3ac7c606a2..494d3a7930e1 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -41,7 +41,7 @@ tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking'] distributed_training_algorithms = ['data', 'voting'] data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical'] -boosting_types = ['gbdt', 'dart', 'goss', 'rf'] +boosting_types = ['gbdt', 'dart', 'goss', 'rf', 'mvs'] group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50] task_to_dask_factory = { 'regression': lgb.DaskLGBMRegressor, @@ -266,6 +266,12 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): }) elif boosting_type == 'goss': params['top_rate'] = 0.5 + elif boosting_type == 'mvs': + params.update({ + 'bagging_freq' : 1, + 'mvs_adaptive' : True, + 'bagging_fraction': 0.9 + }) dask_classifier = lgb.DaskLGBMClassifier( client=client, @@ -476,6 +482,12 @@ def test_regressor(output, boosting_type, tree_learner, cluster): 'bagging_freq': 1, 'bagging_fraction': 0.9, }) + elif boosting_type == 'mvs': + params.update({ + 'bagging_freq' : 1, + 'mvs_adaptive' : True, + 'bagging_fraction': 0.9 + }) dask_regressor = lgb.DaskLGBMRegressor( client=client, @@ -671,6 +683,12 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): 'bagging_freq': 1, 'bagging_fraction': 0.9, }) + elif boosting_type == 'mvs': + params.update({ + 'bagging_freq' : 1, + 'mvs_adaptive' : True, + 'bagging_fraction': 0.9 + }) dask_ranker = lgb.DaskLGBMRanker( client=client, From 49ed4ebd9a3951ec34a36b7e6e17b5c6230e7c88 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 17:14:34 +0300 Subject: [PATCH 16/26] Moved CalculateThresholdSequential to array_args.h and renamed it to CalculateThresholdMVS --- include/LightGBM/utils/array_args.h | 31 +++++++++++++++++++++++++ src/boosting/mvs.cpp | 36 +---------------------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index 0183ecc22ddb..cd5ebe43a609 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -183,6 +184,36 @@ class ArrayArgs { } return true; } + + static double CalculateThresholdMVS(std::vector* gradients, data_size_t begin, data_size_t end, + const double sample_size) { + double current_sum_small = 0.0; + data_size_t big_grad_size = 0; + + while (begin != end) { + data_size_t middle_begin = 0, middle_end = 0; + ArrayArgs::Partition(gradients, begin, end, &middle_begin, &middle_end); + ++middle_begin; // for half intervals + const data_size_t n_middle = middle_end - middle_begin; + const data_size_t large_size = middle_begin - begin; + + const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0); + const double sum_middle = (*gradients)[middle_begin] * n_middle; + + const double + current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size; + + if (current_sampling_rate > sample_size) { + current_sum_small += sum_small + sum_middle; + end = middle_begin; + } else { + big_grad_size += n_middle + large_size; + begin = middle_end; + } + } + + return current_sum_small / (sample_size - big_grad_size + kEpsilon); + } }; } // namespace LightGBM diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index fbbf9785fd7f..5ecdacca1170 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -5,46 +5,12 @@ #include "mvs.hpp" -#include -#include - - namespace LightGBM { using ConstTreeIterator = std::vector>::const_iterator; MVS::MVS() : GBDT() {} -static double CalculateThresholdSequential(std::vector* gradients, data_size_t begin, data_size_t end, - const double sample_size) { - double current_sum_small = 0.0; - data_size_t big_grad_size = 0; - - while (begin != end) { - data_size_t middle_begin = 0, middle_end = 0; - ArrayArgs::Partition(gradients, begin, end, &middle_begin, &middle_end); - ++middle_begin; // for half intervals - const data_size_t n_middle = middle_end - middle_begin; - const data_size_t large_size = middle_begin - begin; - - const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0); - const double sum_middle = (*gradients)[middle_begin] * n_middle; - - const double - current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size; - - if (current_sampling_rate > sample_size) { - current_sum_small += sum_small + sum_middle; - end = middle_begin; - } else { - big_grad_size += n_middle + large_size; - begin = middle_end; - } - } - - return current_sum_small / (sample_size - big_grad_size + kEpsilon); -} - static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, ConstTreeIterator end) { double sum_values = 0.0; data_size_t num_leaves = (*begin)->num_leaves(); @@ -188,7 +154,7 @@ double MVS::GetThreshold(data_size_t begin, data_size_t cnt) { tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]); } - double threshold = CalculateThresholdSequential(&tmp_derivatives_, begin, begin + cnt, + double threshold = ArrayArgs::CalculateThresholdMVS(&tmp_derivatives_, begin, begin + cnt, cnt * config_->bagging_fraction); return threshold; } From d018ed0bb509cf95e9f9cbddf7ebc67cb5ce9d42 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 18:58:20 +0300 Subject: [PATCH 17/26] Added cpp tests for ArrayArgs::CalculateThresholdMVS and ArrayArgs::Partition. --- include/LightGBM/utils/array_args.h | 2 +- tests/cpp_tests/test_mvs_threshold_search.cpp | 57 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tests/cpp_tests/test_mvs_threshold_search.cpp diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index cd5ebe43a609..51e13969900d 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -191,7 +191,7 @@ class ArrayArgs { data_size_t big_grad_size = 0; while (begin != end) { - data_size_t middle_begin = 0, middle_end = 0; + data_size_t middle_begin = begin - 1, middle_end = end; ArrayArgs::Partition(gradients, begin, end, &middle_begin, &middle_end); ++middle_begin; // for half intervals const data_size_t n_middle = middle_end - middle_begin; diff --git a/tests/cpp_tests/test_mvs_threshold_search.cpp b/tests/cpp_tests/test_mvs_threshold_search.cpp new file mode 100644 index 000000000000..088c71450a11 --- /dev/null +++ b/tests/cpp_tests/test_mvs_threshold_search.cpp @@ -0,0 +1,57 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include +#include +#include + +using namespace LightGBM; + +template +double ComputeExpectationOfMVS(const std::vector &grads, double threshold) { + double expectation = 0.0; + for (const auto &value: grads) { + if (value >= threshold) { + expectation += 1.; + } else { + expectation += value / threshold; + } + } + return expectation; +} + +void ComputeSamplingRate(std::vector gradients, + const double sampling_fraction, + double *expected_sample_size, + double *resulting_sample_size) { + CHECK(expected_sample_size != nullptr); + CHECK(resulting_sample_size != nullptr); + *expected_sample_size = sampling_fraction * static_cast(gradients.size()); + double threshold = ArrayArgs::CalculateThresholdMVS(&gradients, 0, gradients.size(), *expected_sample_size); + *resulting_sample_size = ComputeExpectationOfMVS(gradients, threshold); +} + +TEST(SearchThresholdMVS, Basic) { + std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); + double expected, resulting; + ComputeSamplingRate(gradients, 0.5, &expected, &resulting); + EXPECT_DOUBLE_EQ(expected, resulting); +} + +TEST(ArrayArgs, Partition) { + std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); + data_size_t middle_begin = -1, middle_end = gradients.size(); + ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); + EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); + EXPECT_GT(gradients[0], gradients[middle_begin + 1]); + EXPECT_GT(gradients[middle_begin + 1], gradients.back()); +} + +TEST(SearchThresholdMVS, PartitionOneElement) { + std::vector gradients({0.5f}); + data_size_t middle_begin = -1, middle_end = gradients.size(); + ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); + EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); +} \ No newline at end of file From d62c98c7d7945161ab09b6a2d2fb1348f465fcbc Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 19:00:07 +0300 Subject: [PATCH 18/26] Fix linter errors in test_dask.py --- tests/python_package_test/test_dask.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 494d3a7930e1..4a301a70009a 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -268,8 +268,8 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): params['top_rate'] = 0.5 elif boosting_type == 'mvs': params.update({ - 'bagging_freq' : 1, - 'mvs_adaptive' : True, + 'bagging_freq': 1, + 'mvs_adaptive': True, 'bagging_fraction': 0.9 }) @@ -484,8 +484,8 @@ def test_regressor(output, boosting_type, tree_learner, cluster): }) elif boosting_type == 'mvs': params.update({ - 'bagging_freq' : 1, - 'mvs_adaptive' : True, + 'bagging_freq': 1, + 'mvs_adaptive': True, 'bagging_fraction': 0.9 }) @@ -685,8 +685,8 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): }) elif boosting_type == 'mvs': params.update({ - 'bagging_freq' : 1, - 'mvs_adaptive' : True, + 'bagging_freq': 1, + 'mvs_adaptive': True, 'bagging_fraction': 0.9 }) From 8cee27e8f23eb142355987bb8e29238e1a1e88b1 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 19:01:20 +0300 Subject: [PATCH 19/26] Fixed UB in ArrayArgs::Partition, when it is called with one element. --- include/LightGBM/utils/array_args.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h index 51e13969900d..c6362b84b41d 100644 --- a/include/LightGBM/utils/array_args.h +++ b/include/LightGBM/utils/array_args.h @@ -104,7 +104,7 @@ class ArrayArgs { int j = end - 1; int p = i; int q = j; - if (start >= end) { + if (start >= end - 1) { return; } std::vector& ref = *arr; From 224ac053e25ed498435cffcfc119570901046ff4 Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 20:04:32 +0300 Subject: [PATCH 20/26] Fixed linter errors --- tests/cpp_tests/test_mvs_threshold_search.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/cpp_tests/test_mvs_threshold_search.cpp b/tests/cpp_tests/test_mvs_threshold_search.cpp index 088c71450a11..ef6f97bc2df5 100644 --- a/tests/cpp_tests/test_mvs_threshold_search.cpp +++ b/tests/cpp_tests/test_mvs_threshold_search.cpp @@ -7,12 +7,12 @@ #include #include -using namespace LightGBM; template double ComputeExpectationOfMVS(const std::vector &grads, double threshold) { + using namespace LightGBM; double expectation = 0.0; - for (const auto &value: grads) { + for (const auto &value : grads) { if (value >= threshold) { expectation += 1.; } else { @@ -22,10 +22,11 @@ double ComputeExpectationOfMVS(const std::vector &grads, double threshold return expectation; } -void ComputeSamplingRate(std::vector gradients, +void ComputeSamplingRate(std::vector gradients, const double sampling_fraction, double *expected_sample_size, double *resulting_sample_size) { + using namespace LightGBM; CHECK(expected_sample_size != nullptr); CHECK(resulting_sample_size != nullptr); *expected_sample_size = sampling_fraction * static_cast(gradients.size()); @@ -34,13 +35,15 @@ void ComputeSamplingRate(std::vector gradients, } TEST(SearchThresholdMVS, Basic) { - std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); + using namespace LightGBM; + std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); double expected, resulting; ComputeSamplingRate(gradients, 0.5, &expected, &resulting); EXPECT_DOUBLE_EQ(expected, resulting); } TEST(ArrayArgs, Partition) { + using namespace LightGBM; std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); data_size_t middle_begin = -1, middle_end = gradients.size(); ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); @@ -50,8 +53,9 @@ TEST(ArrayArgs, Partition) { } TEST(SearchThresholdMVS, PartitionOneElement) { + using namespace LightGBM; std::vector gradients({0.5f}); data_size_t middle_begin = -1, middle_end = gradients.size(); ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); -} \ No newline at end of file +} From 5cd44229c07f86ecef6cc5c145e39cb04a88ee1c Mon Sep 17 00:00:00 2001 From: kruda Date: Mon, 10 May 2021 23:14:39 +0300 Subject: [PATCH 21/26] Added more cpp tests and fixed linting errors --- tests/cpp_tests/test_mvs_threshold_search.cpp | 62 ++++++++++++++++--- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/tests/cpp_tests/test_mvs_threshold_search.cpp b/tests/cpp_tests/test_mvs_threshold_search.cpp index ef6f97bc2df5..3d25e4c9d023 100644 --- a/tests/cpp_tests/test_mvs_threshold_search.cpp +++ b/tests/cpp_tests/test_mvs_threshold_search.cpp @@ -7,10 +7,14 @@ #include #include +#include + +using LightGBM::data_size_t; +using LightGBM::score_t; +using LightGBM::ArrayArgs; template double ComputeExpectationOfMVS(const std::vector &grads, double threshold) { - using namespace LightGBM; double expectation = 0.0; for (const auto &value : grads) { if (value >= threshold) { @@ -22,18 +26,30 @@ double ComputeExpectationOfMVS(const std::vector &grads, double threshold return expectation; } -void ComputeSamplingRate(std::vector gradients, - const double sampling_fraction, - double *expected_sample_size, - double *resulting_sample_size) { - using namespace LightGBM; - CHECK(expected_sample_size != nullptr); - CHECK(resulting_sample_size != nullptr); +void ComputeSamplingRate(std::vector gradients, + const double sampling_fraction, + double *expected_sample_size, + double *resulting_sample_size) { + EXPECT_TRUE(expected_sample_size); + EXPECT_TRUE(resulting_sample_size); + *expected_sample_size = sampling_fraction * static_cast(gradients.size()); + double threshold = ArrayArgs::CalculateThresholdMVS(&gradients, 0, gradients.size(), *expected_sample_size); + *resulting_sample_size = ComputeExpectationOfMVS(gradients, threshold); } +template +std::vector GenerateRandomVector(std::mt19937_64 &rng, size_t size) { + std::uniform_real_distribution distribution(1., 2.0f); + std::vector result; + for (size_t i = 0; i < size; ++i) { + result.emplace_back(distribution(rng)); + } + return result; +} + TEST(SearchThresholdMVS, Basic) { using namespace LightGBM; std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); @@ -42,18 +58,44 @@ TEST(SearchThresholdMVS, Basic) { EXPECT_DOUBLE_EQ(expected, resulting); } -TEST(ArrayArgs, Partition) { +TEST(SearchThresholdMVS, SameGradientValue) { using namespace LightGBM; + std::vector gradients; + + for (size_t i = 0; i < 10; ++i) { + gradients.emplace_back(1.); + } + + double expected, resulting; + ComputeSamplingRate(gradients, 0.5, &expected, &resulting); + EXPECT_DOUBLE_EQ(expected, resulting); + EXPECT_DOUBLE_EQ(resulting, 5.); +} + +TEST(SearchThresholdMVS, LargeTest) { + std::mt19937_64 rng(42); + const size_t number_of_iterations = 100; + for (size_t i = 0; i < number_of_iterations; ++i) { + std::vector grad = GenerateRandomVector(rng, 10000); + + double expected, resulting; + ComputeSamplingRate(std::move(grad), 0.01 + (0.98 * i) / number_of_iterations, &expected, &resulting); + EXPECT_NEAR(expected, resulting, 1e-3); + } +} + +TEST(ArrayArgs, Partition) { std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); data_size_t middle_begin = -1, middle_end = gradients.size(); + ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); + EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); EXPECT_GT(gradients[0], gradients[middle_begin + 1]); EXPECT_GT(gradients[middle_begin + 1], gradients.back()); } TEST(SearchThresholdMVS, PartitionOneElement) { - using namespace LightGBM; std::vector gradients({0.5f}); data_size_t middle_begin = -1, middle_end = gradients.size(); ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); From 468102b04b8b8bafc6c80c1d4e33bf6db7fcc311 Mon Sep 17 00:00:00 2001 From: kruda Date: Tue, 11 May 2021 00:00:28 +0300 Subject: [PATCH 22/26] Fixed linting errors --- tests/cpp_tests/test_mvs_threshold_search.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/cpp_tests/test_mvs_threshold_search.cpp b/tests/cpp_tests/test_mvs_threshold_search.cpp index 3d25e4c9d023..7a6463d303fe 100644 --- a/tests/cpp_tests/test_mvs_threshold_search.cpp +++ b/tests/cpp_tests/test_mvs_threshold_search.cpp @@ -51,7 +51,6 @@ std::vector GenerateRandomVector(std::mt19937_64 &rng, size_t size) { } TEST(SearchThresholdMVS, Basic) { - using namespace LightGBM; std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); double expected, resulting; ComputeSamplingRate(gradients, 0.5, &expected, &resulting); @@ -59,7 +58,6 @@ TEST(SearchThresholdMVS, Basic) { } TEST(SearchThresholdMVS, SameGradientValue) { - using namespace LightGBM; std::vector gradients; for (size_t i = 0; i < 10; ++i) { From fd3f64a174cefb758e78f03a00fcd02d5164b446 Mon Sep 17 00:00:00 2001 From: kruda Date: Tue, 11 May 2021 18:50:48 +0300 Subject: [PATCH 23/26] Updated R-package documentation Updated documentation Updated test_mvs_threshold_search.cpp Added parallel computation of regularized absolute value term. Added new mvs parameter from constant. --- R-package/R/lgb.cv.R | 3 ++- R-package/R/lgb.train.R | 3 ++- R-package/R/lightgbm.R | 3 ++- R-package/man/lgb.cv.Rd | 3 ++- R-package/man/lgb.train.Rd | 3 ++- R-package/man/lightgbm.Rd | 3 ++- docs/Parameters.rst | 10 ++++++++ include/LightGBM/config.h | 8 ++++++ src/boosting/mvs.cpp | 25 +++++++++++-------- src/boosting/mvs.hpp | 2 -- src/io/config_auto.cpp | 5 ++++ tests/cpp_tests/test_mvs_threshold_search.cpp | 8 +++--- 12 files changed, 53 insertions(+), 23 deletions(-) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 74a4e4d64728..5d6b52f8b5bc 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -41,7 +41,8 @@ CVBooster <- R6::R6Class( #' into a predictor model which frees up memory and the original datasets #' @param ... other parameters, see Parameters.rst for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"goss"} +#' or \code{"mvs"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfit when #data is small. Tree still grow by leaf-wise.} diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index c47d6ce6901e..18bfffa53611 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -15,7 +15,8 @@ #' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for more information. A few key parameters: #' \itemize{ -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, +#' \code{"goss"} or \code{"mvs"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfitting. Tree still grow by leaf-wise.} diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index e2df9063ed26..17169ec28272 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -90,7 +90,8 @@ NULL #' say "the first and tenth columns").} #' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets} -#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} +#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, +#' \code{"goss"} or \code{"mvs"}.} #' \item{\code{num_leaves}: Maximum number of leaves in one tree.} #' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with #' overfit when #data is small. Tree still grow by leaf-wise.} diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index ec606d880ac6..9e5238eb7825 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -118,7 +118,8 @@ into a predictor model which frees up memory and the original datasets} \item{...}{other parameters, see Parameters.rst for more information. A few key parameters: \itemize{ - \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} + \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"goss"} + or \code{"mvs"}.} \item{\code{num_leaves}: Maximum number of leaves in one tree.} \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with overfit when #data is small. Tree still grow by leaf-wise.} diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 40c7135d3b26..1239872d6099 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -102,7 +102,8 @@ original datasets} \item{...}{other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for more information. A few key parameters: \itemize{ - \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} + \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, + \code{"goss"} or \code{"mvs"}.} \item{\code{num_leaves}: Maximum number of leaves in one tree.} \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with overfitting. Tree still grow by leaf-wise.} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 6512dbc6b23a..e175a453cc7c 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -63,7 +63,8 @@ set to the iteration number of the best iteration.} say "the first and tenth columns").} \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets} - \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} + \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, + \code{"goss"} or \code{"mvs"}.} \item{\code{num_leaves}: Maximum number of leaves in one tree.} \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with overfit when #data is small. Tree still grow by leaf-wise.} diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 43ecdd83d97b..e837a92ac51a 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -350,6 +350,16 @@ Learning Control Parameters - used only in ``mvs`` +- ``mvs_max_sequential_size`` :raw-html:`🔗︎`, default = ``256000``, type = int, constraints: ``mvs_max_sequential_size > 0`` + + - used in MVS boosting training dataset size is greater than ``mvs_max_sequential_size``, than threshold + + - for MVS is chosen for each thread independently. + + - used only in ``mvs`` + + - **Note**: on small dataset setting this parameter less than size of dataset may produce results depending on number of threads + - ``bagging_freq`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``subsample_freq`` - frequency for bagging diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 19479c92fe88..d1a0db9a7b9b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -330,6 +330,14 @@ struct Config { // desc = used only in ``mvs`` bool mvs_adaptive = false; + // default = 256000 + // check = >0 + // desc = used in MVS boosting training dataset size is greater than ``mvs_max_sequential_size``, than threshold + // desc = for MVS is chosen for each thread independently. + // desc = used only in ``mvs`` + // desc = **Note**: on small dataset setting this parameter less than size of dataset may produce results depending on number of threads + int mvs_max_sequential_size = 256000; + // alias = subsample_freq // desc = frequency for bagging // desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index 5ecdacca1170..b5dbad319195 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -5,6 +5,8 @@ #include "mvs.hpp" +#include + namespace LightGBM { using ConstTreeIterator = std::vector>::const_iterator; @@ -69,7 +71,17 @@ void MVS::Bagging(int iter) { bag_data_cnt_ = num_data_; mvs_lambda_ = GetLambda(); - if (num_data_ <= kMaxSequentialSize) { + #pragma omp parallel for schedule(static, 1024) + for (data_size_t i = 0; i < num_data_; ++i) { + tmp_derivatives_[i] = 0.0f; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + i; + tmp_derivatives_[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; + } + tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]); + } + + if (num_data_ <= config_->mvs_max_sequential_size) { threshold_ = GetThreshold(0, num_data_); } @@ -141,19 +153,10 @@ data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t * double MVS::GetThreshold(data_size_t begin, data_size_t cnt) { data_size_t n_blocks, block_size; Threading::BlockInfoForceSize(num_data_, bagging_rand_block_, &n_blocks, &block_size); - if (num_data_ < kMaxSequentialSize && block_size > 1 && threshold_ != 0.0) { + if (num_data_ <= config_->mvs_max_sequential_size && block_size > 1 && threshold_ != 0.0) { return threshold_; } - for (data_size_t i = begin; i < begin + cnt; ++i) { - tmp_derivatives_[i] = 0.0f; - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - size_t idx = static_cast(cur_tree_id) * num_data_ + i; - tmp_derivatives_[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx]; - } - tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]); - } - double threshold = ArrayArgs::CalculateThresholdMVS(&tmp_derivatives_, begin, begin + cnt, cnt * config_->bagging_fraction); return threshold; diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index d3e60483fe45..02eeb34649a3 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -91,8 +91,6 @@ class MVS : public GBDT { double GetLambda(); - static const data_size_t kMaxSequentialSize = 256000; - double mvs_lambda_; double threshold_{0.0}; std::vector tmp_derivatives_; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index f4fcabdf522d..f581f1f32a0c 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -198,6 +198,7 @@ const std::unordered_set& Config::parameter_set() { "neg_bagging_fraction", "mvs_lambda", "mvs_adaptive", + "mvs_max_sequential_size", "bagging_freq", "bagging_seed", "feature_fraction", @@ -364,6 +365,9 @@ void Config::GetMembersFromString(const std::unordered_map gradients, } template -std::vector GenerateRandomVector(std::mt19937_64 &rng, size_t size) { +std::vector GenerateRandomVector(std::mt19937_64 *rng, size_t size) { std::uniform_real_distribution distribution(1., 2.0f); std::vector result; for (size_t i = 0; i < size; ++i) { - result.emplace_back(distribution(rng)); + result.emplace_back(distribution(*rng)); } return result; } @@ -74,7 +74,7 @@ TEST(SearchThresholdMVS, LargeTest) { std::mt19937_64 rng(42); const size_t number_of_iterations = 100; for (size_t i = 0; i < number_of_iterations; ++i) { - std::vector grad = GenerateRandomVector(rng, 10000); + std::vector grad = GenerateRandomVector(&rng, 10000); double expected, resulting; ComputeSamplingRate(std::move(grad), 0.01 + (0.98 * i) / number_of_iterations, &expected, &resulting); @@ -93,7 +93,7 @@ TEST(ArrayArgs, Partition) { EXPECT_GT(gradients[middle_begin + 1], gradients.back()); } -TEST(SearchThresholdMVS, PartitionOneElement) { +TEST(ArrayArgs, PartitionOneElement) { std::vector gradients({0.5f}); data_size_t middle_begin = -1, middle_end = gradients.size(); ArrayArgs::Partition(&gradients, 0, gradients.size(), &middle_begin, &middle_end); From 11df7890724868ce0609758fb81062829e3f48a9 Mon Sep 17 00:00:00 2001 From: kruda Date: Fri, 14 May 2021 19:52:42 +0300 Subject: [PATCH 24/26] Updated MVS Lambda algorithm --- src/boosting/mvs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index b5dbad319195..02dc5c7f9015 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -54,7 +54,6 @@ double MVS::GetLambda() { } double lambda = (this->iter_ > 0) ? ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_, models_.cend()) - / config_->learning_rate : ComputeMeanGradValues(gradients_.data(), hessians_.data(), num_data_, From ddcab83422a9d21ce6393a968449a466843d52f4 Mon Sep 17 00:00:00 2001 From: kruda Date: Tue, 6 Jul 2021 15:31:50 +0300 Subject: [PATCH 25/26] Updated documentation, MVS::GetLambda, MVS::GetThreshold, updated MVS::ResetConfig --- docs/Parameters.rst | 24 ++++++++++++++++++++++++ include/LightGBM/config.h | 4 ++-- src/boosting/mvs.cpp | 35 +++++++++++++++-------------------- src/boosting/mvs.hpp | 4 +++- 4 files changed, 44 insertions(+), 23 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index c54bf96a6f8e..8050f193e9b7 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -139,6 +139,8 @@ Core Parameters - **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations + - ``mvs``, Minimal variance sampling __ + - ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename`` - path of training data, LightGBM will train from this data @@ -334,6 +336,28 @@ Learning Control Parameters - **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored +- ``mvs_lambda`` :raw-html:`🔗︎`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0`` + + - used in MVS boosting. If ``mvs_adaptive == true`` then this value is ignored. + + - used only in ``mvs`` + +- ``mvs_adaptive`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - use adaptive variant of mvs boosting + + - used only in ``mvs`` + +- ``mvs_max_sequential_size`` :raw-html:`🔗︎`, default = ``256000``, type = int, constraints: ``mvs_max_sequential_size > 0`` + + - used in MVS boosting training. If dataset size is greater than ``mvs_max_sequential_size``, then threshold + + - for MVS is chosen for each thread independently. + + - used only in ``mvs`` + + - **Note**: on small dataset setting this parameter less than size of dataset may produce results depending on number of threads + - ``bagging_freq`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``subsample_freq`` - frequency for bagging diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 286250ddd691..bb9a2c251591 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -308,7 +308,7 @@ struct Config { // default = 1e-4 // check = >0.0 - // desc = used in MVS boosting if ``mvs_adaptive == true`` than this value is ignored + // desc = used in MVS boosting. If ``mvs_adaptive == true`` then this value is ignored. // desc = used only in ``mvs`` double mvs_lambda = 1e-4; @@ -319,7 +319,7 @@ struct Config { // default = 256000 // check = >0 - // desc = used in MVS boosting training dataset size is greater than ``mvs_max_sequential_size``, than threshold + // desc = used in MVS boosting training. If dataset size is greater than ``mvs_max_sequential_size``, then threshold // desc = for MVS is chosen for each thread independently. // desc = used only in ``mvs`` // desc = **Note**: on small dataset setting this parameter less than size of dataset may produce results depending on number of threads diff --git a/src/boosting/mvs.cpp b/src/boosting/mvs.cpp index 02dc5c7f9015..c3ba89f215ba 100644 --- a/src/boosting/mvs.cpp +++ b/src/boosting/mvs.cpp @@ -13,10 +13,11 @@ using ConstTreeIterator = std::vector>::const_iterator; MVS::MVS() : GBDT() {} -static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, ConstTreeIterator end) { +static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, + ConstTreeIterator end, + const data_size_t num_leaves) { double sum_values = 0.0; - data_size_t num_leaves = (*begin)->num_leaves(); -#pragma omp parallel for schedule(static, 2048) reduction(+:sum_values) +#pragma omp parallel for schedule(static, 2048) reduction(+ : sum_values) for (data_size_t leaf_idx = 0; leaf_idx < num_leaves; ++leaf_idx) { double leave_value = 0.0; for (ConstTreeIterator it = begin; it != end; ++it) { @@ -30,12 +31,11 @@ static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, ConstTreeIt return sum_values / num_leaves; } -static double ComputeMeanGradValues(score_t *gradients, - score_t *hessians, +static double ComputeMeanGradValues(score_t *gradients, score_t *hessians, data_size_t size, data_size_t num_tree_per_iteration) { double sum = 0.0; -#pragma omp parallel for schedule(static, 1024) reduction(+:sum) +#pragma omp parallel for schedule(static, 1024) reduction(+ : sum) for (data_size_t i = 0; i < size; ++i) { double local_hessians = 0.0, local_gradients = 0.0; for (data_size_t j = 0; j < num_tree_per_iteration; ++j) { @@ -52,25 +52,23 @@ double MVS::GetLambda() { if (!mvs_adaptive_) { return mvs_lambda_; } - double lambda = - (this->iter_ > 0) ? ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_, models_.cend()) - : ComputeMeanGradValues(gradients_.data(), - hessians_.data(), - num_data_, - num_tree_per_iteration_); - - return lambda; + if (this->iter_ > 0) { + return ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_, + models_.cend(), config_->num_leaves); + } + return ComputeMeanGradValues(gradients_.data(), hessians_.data(), num_data_, + num_tree_per_iteration_); } void MVS::Bagging(int iter) { if (iter % config_->bagging_freq != 0 && !need_re_bagging_) { return; } - + need_re_bagging_ = false; bag_data_cnt_ = num_data_; mvs_lambda_ = GetLambda(); - #pragma omp parallel for schedule(static, 1024) + //#pragma omp parallel for schedule(static, 1024) for (data_size_t i = 0; i < num_data_; ++i) { tmp_derivatives_[i] = 0.0f; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -150,9 +148,7 @@ data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t * } double MVS::GetThreshold(data_size_t begin, data_size_t cnt) { - data_size_t n_blocks, block_size; - Threading::BlockInfoForceSize(num_data_, bagging_rand_block_, &n_blocks, &block_size); - if (num_data_ <= config_->mvs_max_sequential_size && block_size > 1 && threshold_ != 0.0) { + if (num_data_ <= config_->mvs_max_sequential_size && threshold_ != 0.0) { return threshold_; } @@ -165,7 +161,6 @@ void MVS::ResetMVS() { CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0); CHECK(config_->mvs_lambda >= 0.0f); CHECK(!balanced_bagging_); - bag_data_indices_.resize(num_data_); tmp_derivatives_.resize(num_data_); Log::Info("Using MVS"); diff --git a/src/boosting/mvs.hpp b/src/boosting/mvs.hpp index 02eeb34649a3..13f081fbf016 100644 --- a/src/boosting/mvs.hpp +++ b/src/boosting/mvs.hpp @@ -51,6 +51,8 @@ class MVS : public GBDT { void ResetConfig(const Config *config) override { GBDT::ResetConfig(config); + need_re_bagging_ = mvs_adaptive_ != config->mvs_adaptive + || (mvs_lambda_ != config->mvs_lambda && !mvs_adaptive_ && !config->mvs_adaptive); mvs_lambda_ = config_->mvs_lambda; mvs_adaptive_ = config_->mvs_adaptive; ResetMVS(); @@ -63,7 +65,7 @@ class MVS : public GBDT { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static, 1) + #pragma omp parallel for schedule(static, 1024) for (int64_t i = 0; i < total_size; ++i) { gradients_[i] = gradients[i]; hessians_[i] = hessians[i]; From 31ab4d42312a6973b4b8620317ef17b050d7eb50 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 11 Mar 2022 16:46:09 +0300 Subject: [PATCH 26/26] [ci] fix current `master` fails with graphviz-related error (#5068) * Update test_windows.ps1 * Update .appveyor.yml * Update test_windows.ps1 * Update .appveyor.yml --- .ci/test_windows.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 1273d3713350..d4c5012a1b87 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -52,7 +52,7 @@ if ($env:TASK -eq "swig") { conda install -q -y -n $env:CONDA_ENV joblib matplotlib numpy pandas psutil pytest scikit-learn scipy ; Check-Output $? # python-graphviz has to be installed separately to prevent conda from downgrading to pypy -conda install -q -y -n $env:CONDA_ENV python-graphviz ; Check-Output $? +conda install -q -y -n $env:CONDA_ENV libxml2 python-graphviz ; Check-Output $? if ($env:TASK -eq "regular") { mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build