blob: 895f94f868193f79a856dd843c8eb8521a32589b [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "media/learning/impl/learning_task_controller_impl.h"
#include <memory>
#include <utility>
#include <vector>
#include "base/bind.h"
#include "base/check_op.h"
#include "base/notreached.h"
#include "media/learning/impl/distribution_reporter.h"
#include "media/learning/impl/extra_trees_trainer.h"
#include "media/learning/impl/lookup_table_trainer.h"
namespace media {
namespace learning {
LearningTaskControllerImpl::LearningTaskControllerImpl(
const LearningTask& task,
std::unique_ptr<DistributionReporter> reporter,
SequenceBoundFeatureProvider feature_provider)
: task_(task),
training_data_(std::make_unique<TrainingData>()),
reporter_(std::move(reporter)),
helper_(std::make_unique<LearningTaskControllerHelper>(
task,
base::BindRepeating(&LearningTaskControllerImpl::AddFinishedExample,
AsWeakPtr()),
std::move(feature_provider))),
expected_feature_count_(task_.feature_descriptions.size()) {
// Note that |helper_| uses the full set of features.
// TODO(liberato): Make this compositional. FeatureSubsetTaskController?
if (task_.feature_subset_size)
DoFeatureSubsetSelection();
switch (task_.model) {
case LearningTask::Model::kExtraTrees:
trainer_ = std::make_unique<ExtraTreesTrainer>();
break;
case LearningTask::Model::kLookupTable:
trainer_ = std::make_unique<LookupTableTrainer>();
break;
}
}
LearningTaskControllerImpl::~LearningTaskControllerImpl() = default;
void LearningTaskControllerImpl::BeginObservation(
base::UnguessableToken id,
const FeatureVector& features,
const absl::optional<TargetValue>& default_target,
const absl::optional<ukm::SourceId>& source_id) {
// TODO(liberato): Should we enforce that the right number of features are
// present here? Right now, we allow it to be shorter, so that features from
// a FeatureProvider may be omitted. Of course, they have to be at the end in
// that case. If we start enforcing it here, make sure that LearningHelper
// starts adding the placeholder features.
if (!trainer_)
return;
// We don't support default targets, since we're the base learner and can't
// easily do that. However, defaults are handled by (weak) controllers
// handed out by LearningSessionImpl. So, we don't bother since they never
// get here anyway.
DCHECK(!default_target);
helper_->BeginObservation(id, features, source_id);
}
void LearningTaskControllerImpl::CompleteObservation(
base::UnguessableToken id,
const ObservationCompletion& completion) {
if (!trainer_)
return;
helper_->CompleteObservation(id, completion);
}
void LearningTaskControllerImpl::CancelObservation(base::UnguessableToken id) {
if (!trainer_)
return;
helper_->CancelObservation(id);
}
void LearningTaskControllerImpl::UpdateDefaultTarget(
base::UnguessableToken id,
const absl::optional<TargetValue>& default_target) {
NOTREACHED();
}
const LearningTask& LearningTaskControllerImpl::GetLearningTask() {
return task_;
}
void LearningTaskControllerImpl::PredictDistribution(
const FeatureVector& features,
PredictionCB callback) {
if (model_)
std::move(callback).Run(model_->PredictDistribution(features));
else
std::move(callback).Run(absl::nullopt);
}
void LearningTaskControllerImpl::AddFinishedExample(LabelledExample example,
ukm::SourceId source_id) {
// Verify that we have a trainer and that we got the right number of features.
// We don't compare to |task_.feature_descriptions.size()| since that has been
// adjusted to the subset size already. We expect the original count.
if (!trainer_ || example.features.size() != expected_feature_count_)
return;
// Now that we have the whole set of features, select the subset we want.
FeatureVector new_features;
if (task_.feature_subset_size) {
for (auto& iter : feature_indices_)
new_features.push_back(example.features[iter]);
example.features = std::move(new_features);
} // else use them all.
// The features should now match the task.
DCHECK_EQ(example.features.size(), task_.feature_descriptions.size());
if (training_data_->size() >= task_.max_data_set_size) {
// Replace a random example. We don't necessarily want to replace the
// oldest, since we don't necessarily want to enforce an ad-hoc recency
// constraint here. That's a different issue.
(*training_data_)[rng()->Generate(training_data_->size())] = example;
} else {
training_data_->push_back(example);
}
// Either way, we have one more example that we haven't used for training yet.
num_untrained_examples_++;
// Once we have a model, see if we'd get |example| correct.
if (model_ && reporter_) {
TargetHistogram predicted = model_->PredictDistribution(example.features);
DistributionReporter::PredictionInfo info;
info.observed = example.target_value;
info.source_id = source_id;
info.total_training_weight = last_training_weight_;
info.total_training_examples = last_training_size_;
reporter_->GetPredictionCallback(info).Run(predicted);
}
// Can't train more than one model concurrently.
if (training_is_in_progress_)
return;
// Train every time we get enough new examples. Note that this works even if
// we are replacing old examples rather than adding new ones.
double frac = ((double)num_untrained_examples_) / training_data_->size();
if (frac < task_.min_new_data_fraction)
return;
num_untrained_examples_ = 0;
// Record these for metrics.
last_training_weight_ = training_data_->total_weight();
last_training_size_ = training_data_->size();
TrainedModelCB model_cb =
base::BindOnce(&LearningTaskControllerImpl::OnModelTrained, AsWeakPtr(),
training_data_->total_weight(), training_data_->size());
training_is_in_progress_ = true;
// Note that this copies the training data, so it's okay if we add more
// examples to our copy before this returns.
// TODO(liberato): Post to a background task runner, and bind |model_cb| to
// the current one. Be careful about ownership if we invalidate |trainer_|
// on this thread. Be sure to post destruction to that sequence.
trainer_->Train(task_, *training_data_, std::move(model_cb));
}
void LearningTaskControllerImpl::OnModelTrained(double training_weight,
int training_size,
std::unique_ptr<Model> model) {
DCHECK(training_is_in_progress_);
training_is_in_progress_ = false;
model_ = std::move(model);
// Record these for metrics.
last_training_weight_ = training_weight;
last_training_size_ = training_size;
}
void LearningTaskControllerImpl::SetTrainerForTesting(
std::unique_ptr<TrainingAlgorithm> trainer) {
trainer_ = std::move(trainer);
}
void LearningTaskControllerImpl::DoFeatureSubsetSelection() {
// Choose a random feature, and trim the descriptions to match.
std::vector<size_t> features;
for (size_t i = 0; i < task_.feature_descriptions.size(); i++)
features.push_back(i);
for (int i = 0; i < *task_.feature_subset_size; i++) {
// Pick an element from |i| to the end of the list, inclusive.
// TODO(liberato): For tests, this will happen before any rng is provided
// by the test; we'll use an actual rng.
int r = rng()->Generate(features.size() - i) + i;
// Swap them.
std::swap(features[i], features[r]);
}
// Construct the feature subset from the first few elements. Also adjust the
// task's descriptions to match. We do this in two steps so that the
// descriptions are added via iterating over |feature_indices_|, so that the
// enumeration order is the same as when we adjust the feature values of
// incoming examples. In both cases, we iterate over |feature_indicies_|,
// which might (will) re-order them with respect to |features|.
for (int i = 0; i < *task_.feature_subset_size; i++)
feature_indices_.insert(features[i]);
std::vector<LearningTask::ValueDescription> adjusted_descriptions;
for (auto& iter : feature_indices_)
adjusted_descriptions.push_back(task_.feature_descriptions[iter]);
task_.feature_descriptions = adjusted_descriptions;
if (reporter_)
reporter_->SetFeatureSubset(feature_indices_);
}
} // namespace learning
} // namespace media