Skip to content

Commit

Permalink
[MM-41914] Added cluster level metrics (#788)
Browse files Browse the repository at this point in the history
  • Loading branch information
mirshahriar authored Dec 19, 2022
1 parent 68a4033 commit 5a55528
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cmd/cloud/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ func executeServerCmd(flags serverFlags) error {

var multiDoer supervisor.MultiDoer
if supervisorsEnabled.clusterSupervisor {
multiDoer = append(multiDoer, supervisor.NewClusterSupervisor(sqlStore, clusterProvisioner, awsClient, eventsProducer, instanceID, logger))
multiDoer = append(multiDoer, supervisor.NewClusterSupervisor(sqlStore, clusterProvisioner, awsClient, eventsProducer, instanceID, logger, cloudMetrics))
}
if supervisorsEnabled.groupSupervisor {
multiDoer = append(multiDoer, supervisor.NewGroupSupervisor(sqlStore, eventsProducer, instanceID, logger))
Expand Down
57 changes: 57 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ type CloudMetrics struct {
// ClusterInstallation
ClusterInstallationReconcilingDurationHist *prometheus.HistogramVec
ClusterInstallationDeletionDurationHist *prometheus.HistogramVec

// Cluster
ClusterCreationDurationHist *prometheus.HistogramVec
ClusterUpgradeDurationHist *prometheus.HistogramVec
ClusterProvisioningDurationHist *prometheus.HistogramVec
ClusterResizeDurationHist *prometheus.HistogramVec
ClusterDeletionDurationHist *prometheus.HistogramVec
}

// New creates a new Prometheus-based Metrics object to be used
Expand Down Expand Up @@ -134,6 +141,56 @@ func New() *CloudMetrics {
},
[]string{"cluster"},
),
ClusterCreationDurationHist: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: provisionerNamespace,
Subsystem: provisionerSubsystemApp,
Name: "cluster_creation_duration_seconds",
Help: "The duration of cluster creation tasks",
Buckets: standardDurationBuckets(),
},
[]string{},
),
ClusterUpgradeDurationHist: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: provisionerNamespace,
Subsystem: provisionerSubsystemApp,
Name: "cluster_upgrade_duration_seconds",
Help: "The duration of cluster upgrade tasks",
Buckets: standardDurationBuckets(),
},
[]string{},
),
ClusterProvisioningDurationHist: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: provisionerNamespace,
Subsystem: provisionerSubsystemApp,
Name: "cluster_provisioning_duration_seconds",
Help: "The duration of cluster provisioning tasks",
Buckets: standardDurationBuckets(),
},
[]string{},
),
ClusterResizeDurationHist: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: provisionerNamespace,
Subsystem: provisionerSubsystemApp,
Name: "cluster_resize_duration_seconds",
Help: "The duration of cluster resize tasks",
Buckets: standardDurationBuckets(),
},
[]string{},
),
ClusterDeletionDurationHist: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: provisionerNamespace,
Subsystem: provisionerSubsystemApp,
Name: "cluster_deletion_duration_seconds",
Help: "The duration of cluster deletion tasks",
Buckets: standardDurationBuckets(),
},
[]string{},
),
}
}

Expand Down
59 changes: 58 additions & 1 deletion internal/supervisor/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ package supervisor
import (
"sort"

"github.com/mattermost/mattermost-cloud/internal/metrics"
"github.com/mattermost/mattermost-cloud/internal/tools/aws"
"github.com/mattermost/mattermost-cloud/model"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
)

Expand All @@ -23,6 +25,8 @@ type clusterStore interface {
DeleteCluster(clusterID string) error

GetWebhooks(filter *model.WebhookFilter) ([]*model.Webhook, error)

GetStateChangeEvents(filter *model.StateChangeEventFilter) ([]*model.StateChangeEventData, error)
}

// clusterProvisioner abstracts the provisioning operations required by the cluster supervisor.
Expand All @@ -47,17 +51,19 @@ type ClusterSupervisor struct {
aws aws.AWS
eventsProducer eventProducer
instanceID string
metrics *metrics.CloudMetrics
logger log.FieldLogger
}

// NewClusterSupervisor creates a new ClusterSupervisor from the given
// dependencies. Every argument is stored as-is on the returned supervisor; no
// validation is performed here. The metrics collector is the one
// processClusterMetrics uses to record cluster operation durations.
func NewClusterSupervisor(store clusterStore, clusterProvisioner clusterProvisioner, aws aws.AWS, eventProducer eventProducer, instanceID string, logger log.FieldLogger) *ClusterSupervisor {
func NewClusterSupervisor(store clusterStore, clusterProvisioner clusterProvisioner, aws aws.AWS, eventProducer eventProducer, instanceID string, logger log.FieldLogger, metrics *metrics.CloudMetrics) *ClusterSupervisor {
return &ClusterSupervisor{
store: store,
provisioner: clusterProvisioner,
aws: aws,
eventsProducer: eventProducer,
instanceID: instanceID,
metrics: metrics,
logger: logger,
}
}
Expand Down Expand Up @@ -136,6 +142,11 @@ func (s *ClusterSupervisor) Supervise(cluster *model.Cluster) {
return
}

err = s.processClusterMetrics(cluster, logger)
if err != nil {
logger.WithError(err).Error("Failed to process cluster metrics")
}

err = s.eventsProducer.ProduceClusterStateChangeEvent(cluster, oldState)
if err != nil {
logger.WithError(err).Error("Failed to create cluster state change event")
Expand Down Expand Up @@ -283,3 +294,49 @@ func (s *ClusterSupervisor) checkClusterCreated(cluster *model.Cluster, logger l

return s.provisionCluster(cluster, logger)
}

// processClusterMetrics records the duration of the cluster operation that
// just finished. It only acts when the cluster is in the stable or deleted
// state; for any other state it returns nil without touching the store.
//
// The duration is measured from the timestamp of the most recent state change
// event whose new state is one of model.AllClusterRequestStates. An error is
// returned when the events cannot be fetched, when the store does not return
// exactly one event, or when the event's new state is not one of the handled
// request states.
func (s *ClusterSupervisor) processClusterMetrics(cluster *model.Cluster, logger log.FieldLogger) error {
	// Durations are only meaningful once the operation has settled.
	switch cluster.State {
	case model.ClusterStateStable, model.ClusterStateDeleted:
		// Fall through to metric processing below.
	default:
		return nil
	}

	// Ask for a single event: the latest one of a 'requested' type, which
	// tells us which operation was running and when it started.
	filter := &model.StateChangeEventFilter{
		ResourceID:   cluster.ID,
		ResourceType: model.TypeCluster,
		NewStates:    model.AllClusterRequestStates,
		Paging:       model.Paging{Page: 0, PerPage: 1, IncludeDeleted: false},
	}
	requestEvents, err := s.store.GetStateChangeEvents(filter)
	if err != nil {
		return errors.Wrap(err, "failed to get state change events")
	}
	if count := len(requestEvents); count != 1 {
		return errors.Errorf("expected 1 state change event, but got %d", count)
	}

	latest := requestEvents[0]
	elapsedSeconds := model.ElapsedTimeInSeconds(latest.Event.Timestamp)

	// record feeds the elapsed time to the chosen histogram observer and
	// emits the matching debug message.
	record := func(observe func(float64), format string) {
		observe(elapsedSeconds)
		logger.Debugf(format, int(elapsedSeconds))
	}

	switch latest.StateChange.NewState {
	case model.ClusterStateCreationRequested:
		record(s.metrics.ClusterCreationDurationHist.WithLabelValues().Observe, "Cluster was created in %d seconds")
	case model.ClusterStateUpgradeRequested:
		record(s.metrics.ClusterUpgradeDurationHist.WithLabelValues().Observe, "Cluster was upgraded in %d seconds")
	case model.ClusterStateProvisioningRequested:
		record(s.metrics.ClusterProvisioningDurationHist.WithLabelValues().Observe, "Cluster was provisioned in %d seconds")
	case model.ClusterStateResizeRequested:
		record(s.metrics.ClusterResizeDurationHist.WithLabelValues().Observe, "Cluster was resized in %d seconds")
	case model.ClusterStateDeletionRequested:
		record(s.metrics.ClusterDeletionDurationHist.WithLabelValues().Observe, "Cluster was deleted in %d seconds")
	default:
		return errors.Errorf("failed to handle event %s with new state %s", latest.Event.ID, latest.StateChange.NewState)
	}

	return nil
}
9 changes: 9 additions & 0 deletions internal/supervisor/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ func (s *mockClusterStore) GetWebhooks(filter *model.WebhookFilter) ([]*model.We
return nil, nil
}

// GetStateChangeEvents is a stub that ignores the filter and reports no
// events and no error.
func (s *mockClusterStore) GetStateChangeEvents(filter *model.StateChangeEventFilter) ([]*model.StateChangeEventData, error) {
	var noEvents []*model.StateChangeEventData
	return noEvents, nil
}

type mockClusterProvisioner struct{}

func (p *mockClusterProvisioner) PrepareCluster(cluster *model.Cluster) bool {
Expand Down Expand Up @@ -117,6 +121,7 @@ func TestClusterSupervisorDo(t *testing.T) {
&mockEventProducer{},
"instanceID",
logger,
cloudMetrics,
)
err := supervisor.Do()
require.NoError(t, err)
Expand All @@ -142,6 +147,7 @@ func TestClusterSupervisorDo(t *testing.T) {
&mockEventProducer{},
"instanceID",
logger,
cloudMetrics,
)
err := supervisor.Do()
require.NoError(t, err)
Expand Down Expand Up @@ -193,6 +199,7 @@ func TestClusterSupervisorDo(t *testing.T) {
mockEventProducer,
"instanceID",
logger,
cloudMetrics,
)
err := supervisor.Do()
require.NoError(t, err)
Expand Down Expand Up @@ -230,6 +237,7 @@ func TestClusterSupervisorSupervise(t *testing.T) {
testutil.SetupTestEventsProducer(sqlStore, logger),
"instanceID",
logger,
cloudMetrics,
)

cluster := &model.Cluster{
Expand Down Expand Up @@ -259,6 +267,7 @@ func TestClusterSupervisorSupervise(t *testing.T) {
testutil.SetupTestEventsProducer(sqlStore, logger),
"instanceID",
logger,
cloudMetrics,
)

cluster := &model.Cluster{
Expand Down

0 comments on commit 5a55528

Please sign in to comment.