From c9c07b0b1d79d3df4501b27889c6f9ae21598548 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Mon, 27 May 2024 18:09:07 +0300 Subject: [PATCH 01/10] add healthiness endpoint --- grid-proxy/internal/explorer/db/postgres.go | 37 +++++++++ grid-proxy/internal/explorer/db/types.go | 3 + grid-proxy/internal/explorer/server.go | 9 +++ grid-proxy/internal/explorer/state.go | 87 +++++++++++++++++++++ grid-proxy/pkg/types/version.go | 7 ++ 5 files changed, 143 insertions(+) create mode 100644 grid-proxy/internal/explorer/state.go diff --git a/grid-proxy/internal/explorer/db/postgres.go b/grid-proxy/internal/explorer/db/postgres.go index bc722081a..82698c168 100644 --- a/grid-proxy/internal/explorer/db/postgres.go +++ b/grid-proxy/internal/explorer/db/postgres.go @@ -76,6 +76,35 @@ func (d *PostgresDatabase) Close() error { return db.Close() } +func (d *PostgresDatabase) Ping() error { + db, err := d.gormDB.DB() + if err != nil { + return fmt.Errorf("failed to get db connection") + } + + if err := db.Ping(); err != nil { + return fmt.Errorf("failed to ping db") + } + + return nil +} + +func (d *PostgresDatabase) Initialized() error { + db, err := d.gormDB.DB() + if err != nil { + return fmt.Errorf("failed to get db connection") + } + + initTables := []string{"node_gpu", "resources_cache"} + for _, tableName := range initTables { + if _, err := db.Query("select * from " + tableName + ";"); err != nil { + return err + } + } + + return nil +} + func (d *PostgresDatabase) Initialize() error { err := d.gormDB.AutoMigrate( &types.NodeGPU{}, @@ -911,3 +940,11 @@ func (d *PostgresDatabase) GetContractBills(ctx context.Context, contractID uint return bills, uint(count), nil } + +func (d *PostgresDatabase) GetRandomHealthyTwinIds(length int) ([]uint32, error) { + var ids []uint32 + if err := d.gormDB.Table("health_report").Select("node_twin_id").Where("healthy = true").Order("random()").Limit(int(length)).Scan(&ids).Error; err != nil { + return []uint32{}, err + } + return ids, nil +} diff --git a/grid-proxy/internal/explorer/db/types.go b/grid-proxy/internal/explorer/db/types.go index 4cd1984bf..19ed50f1a 100644 --- a/grid-proxy/internal/explorer/db/types.go +++ b/grid-proxy/internal/explorer/db/types.go @@ -9,6 +9,9 @@ import ( // Database interface for storing and fetching grid info type Database interface { GetConnectionString() string + Ping() error + Initialized() error + GetRandomHealthyTwinIds(length int) ([]uint32, error) // server getters GetStats(ctx context.Context, filter types.StatsFilter) (types.Stats, error) diff --git a/grid-proxy/internal/explorer/server.go b/grid-proxy/internal/explorer/server.go index 9d725ede9..03f64c8bc 100644 --- a/grid-proxy/internal/explorer/server.go +++ b/grid-proxy/internal/explorer/server.go @@ -410,6 +410,14 @@ func (a *App) version(r *http.Request) (interface{}, mw.Response) { }, response } +func (a *App) healthiness(r *http.Request) (interface{}, mw.Response) { + response := mw.Ok() + return createReport( + a.cl, + a.relayClient, + ), response +} + // getNodeStatistics godoc // @Summary Show node statistics // @Description Get node statistics for more information about each node through the RMB relay @@ -569,6 +577,7 @@ func Setup(router *mux.Router, gitCommit string, cl DBClient, relayClient rmb.Cl router.HandleFunc("/", mw.AsHandlerFunc(a.indexPage(router))) router.HandleFunc("/ping", mw.AsHandlerFunc(a.ping)) router.HandleFunc("/version", mw.AsHandlerFunc(a.version)) + router.HandleFunc("/healthiness", mw.AsHandlerFunc(a.healthiness)) router.PathPrefix("/swagger/").Handler(httpSwagger.WrapHandler) return nil diff --git a/grid-proxy/internal/explorer/state.go b/grid-proxy/internal/explorer/state.go new file mode 100644 index 000000000..63c25b9eb --- /dev/null +++ b/grid-proxy/internal/explorer/state.go @@ -0,0 +1,87 @@ +package explorer + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" + "github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go" +) + +const ( + OkState = "ok" +) + +func createReport(db DBClient, peer rmb.Client) types.Healthiness { + var report types.Healthiness + + // db connection + report.DBConn = OkState + err := db.DB.Ping() + if err != nil { + report.DBConn = err.Error() + } + err = db.DB.Initialized() + if err != nil { + report.DBConn = err.Error() + } + + // rmb connection + report.RMBConn = OkState + if err := pingRandomTwins(db, peer); err != nil { + report.RMBConn = err.Error() + } + + // total + report.TotalStateOk = true + if report.DBConn != OkState || + report.RMBConn != OkState { + report.TotalStateOk = false + } + return report +} + +func pingRandomTwins(db DBClient, peer rmb.Client) error { + twinIds, err := db.DB.GetRandomHealthyTwinIds(10) + if err != nil { + return err + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + var wg sync.WaitGroup + successCh := make(chan bool) + + for _, twinId := range twinIds { + wg.Add(1) + go func(twinId uint32) { + defer wg.Done() + + callCtx, callCancel := context.WithTimeout(ctx, 10*time.Second) + defer callCancel() + + var res interface{} + if err := peer.Call(callCtx, twinId, "zos.system.version", nil, &res); err == nil { + select { + case successCh <- true: + case <-ctx.Done(): + } + } + }(twinId) + } + + go func() { + wg.Wait() + close(successCh) + }() + + select { + case <-successCh: + return nil + case <-ctx.Done(): + return fmt.Errorf("failed to call twins: %+v", twinIds) + } +} diff --git a/grid-proxy/pkg/types/version.go b/grid-proxy/pkg/types/version.go index f522bb320..96b92cd8d 100644 --- a/grid-proxy/pkg/types/version.go +++ b/grid-proxy/pkg/types/version.go @@ -4,3 +4,10 @@ package types type Version struct { Version string `json:"version"` } + +// Healthiness represent the healthiness of the server and connections +type Healthiness struct { + TotalStateOk bool `json:"total_state_ok"` + DBConn string `json:"db_conn"` + RMBConn string `json:"rmb_conn"` +} From c73a8974af3aee1223f9f098b1e023d346ae56b6 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Tue, 28 May 2024 12:29:44 +0300 Subject: [PATCH 02/10] remove unnecessary conversion --- grid-proxy/internal/explorer/db/postgres.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grid-proxy/internal/explorer/db/postgres.go b/grid-proxy/internal/explorer/db/postgres.go index 82698c168..65399a100 100644 --- a/grid-proxy/internal/explorer/db/postgres.go +++ b/grid-proxy/internal/explorer/db/postgres.go @@ -943,7 +943,7 @@ func (d *PostgresDatabase) GetContractBills(ctx context.Context, contractID uint func (d *PostgresDatabase) GetRandomHealthyTwinIds(length int) ([]uint32, error) { var ids []uint32 - if err := d.gormDB.Table("health_report").Select("node_twin_id").Where("healthy = true").Order("random()").Limit(int(length)).Scan(&ids).Error; err != nil { + if err := d.gormDB.Table("health_report").Select("node_twin_id").Where("healthy = true").Order("random()").Limit(length).Scan(&ids).Error; err != nil { return []uint32{}, err } return ids, nil From c79a759e57fae3468d8f30ce50f3067a3cffe050 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Tue, 28 May 2024 15:17:27 +0300 Subject: [PATCH 03/10] check only the table existence instead of querying --- grid-proxy/internal/explorer/db/postgres.go | 9 ++++++++- grid-proxy/internal/explorer/state.go | 6 ++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/grid-proxy/internal/explorer/db/postgres.go b/grid-proxy/internal/explorer/db/postgres.go index 65399a100..35401f65b 100644 --- a/grid-proxy/internal/explorer/db/postgres.go +++ b/grid-proxy/internal/explorer/db/postgres.go @@ -97,9 +97,16 @@ func (d *PostgresDatabase) Initialized() error { initTables := []string{"node_gpu", "resources_cache"} for _, tableName := range initTables { - if _, err := db.Query("select * from " + tableName + ";"); err != nil { + query := "SELECT EXISTS (SELECT FROM pg_tables WHERE schemaname = 'public' AND tablename = $1);" + var exists bool + + if err := db.QueryRow(query, tableName).Scan(&exists); err != nil { return err } + + if !exists { + return fmt.Errorf("table %s does not exist", tableName) + } } return nil diff --git a/grid-proxy/internal/explorer/state.go b/grid-proxy/internal/explorer/state.go index 63c25b9eb..6fc65396c 100644 --- a/grid-proxy/internal/explorer/state.go +++ b/grid-proxy/internal/explorer/state.go @@ -19,12 +19,10 @@ func createReport(db DBClient, peer rmb.Client) types.Healthiness { // db connection report.DBConn = OkState - err := db.DB.Ping() - if err != nil { + if err := db.DB.Ping(); err != nil { report.DBConn = err.Error() } - err = db.DB.Initialized() - if err != nil { + if err := db.DB.Initialized(); err != nil { report.DBConn = err.Error() } From 680f91b89afc37328d5fcca0ece6cfb33a26f347 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 00:25:04 +0300 Subject: [PATCH 04/10] add indexer status to healthiness check --- grid-proxy/cmds/proxy_server/main.go | 13 +++++++--- .../internal/explorer/db/indexer_calls.go | 10 ++++---- grid-proxy/internal/explorer/db/postgres.go | 23 +++++++++++++++++ grid-proxy/internal/explorer/db/types.go | 1 + grid-proxy/internal/explorer/models.go | 1 + grid-proxy/internal/explorer/server.go | 4 ++- grid-proxy/internal/explorer/state.go | 25 ++++++++++++++++++- grid-proxy/internal/indexer/dmi.go | 1 + grid-proxy/internal/indexer/health.go | 1 + grid-proxy/internal/indexer/ipv6.go | 1 + grid-proxy/internal/indexer/speed.go | 2 ++ grid-proxy/internal/indexer/workload.go | 1 + grid-proxy/pkg/types/indexer.go | 5 ++++ grid-proxy/pkg/types/version.go | 14 +++++++++++ grid-proxy/tools/db/schema.sql | 5 ++++ 15 files changed, 97 insertions(+), 10 deletions(-) diff --git a/grid-proxy/cmds/proxy_server/main.go b/grid-proxy/cmds/proxy_server/main.go index ed4a9f526..eebe872e6 100644 --- a/grid-proxy/cmds/proxy_server/main.go +++ b/grid-proxy/cmds/proxy_server/main.go @@ -146,13 +146,20 @@ func main() { log.Fatal().Err(err).Msg("failed to create relay client") } + indexerIntervals := make(map[string]uint) if !f.noIndexer { startIndexers(ctx, f, &db, rpcRmbClient) + indexerIntervals["gpu"] = f.gpuIndexerIntervalMins + indexerIntervals["health"] = f.healthIndexerIntervalMins + indexerIntervals["dmi"] = f.dmiIndexerIntervalMins + indexerIntervals["workloads"] = f.workloadsIndexerIntervalMins + indexerIntervals["ipv6"] = f.ipv6IndexerIntervalMins + indexerIntervals["speed"] = f.speedIndexerIntervalMins } else { log.Info().Msg("Indexers did not start") } - s, err := createServer(f, dbClient, GitCommit, rpcRmbClient) + s, err := createServer(f, dbClient, GitCommit, rpcRmbClient, indexerIntervals) if err != nil { log.Fatal().Err(err).Msg("failed to create mux server") } @@ -273,13 +280,13 @@ func createRPCRMBClient(ctx context.Context, relayURL, mnemonics string, subMana return client, nil } -func createServer(f flags, dbClient explorer.DBClient, gitCommit string, relayClient rmb.Client) (*http.Server, error) { +func createServer(f flags, dbClient explorer.DBClient, gitCommit string, relayClient rmb.Client, idxIntervals map[string]uint) (*http.Server, error) { log.Info().Msg("Creating server") router := mux.NewRouter().StrictSlash(true) // setup explorer - if err := explorer.Setup(router, gitCommit, dbClient, relayClient); err != nil { + if err := explorer.Setup(router, gitCommit, dbClient, relayClient, idxIntervals); err != nil { return nil, err } diff --git a/grid-proxy/internal/explorer/db/indexer_calls.go b/grid-proxy/internal/explorer/db/indexer_calls.go index 93dff1c9d..682e07804 100644 --- a/grid-proxy/internal/explorer/db/indexer_calls.go +++ b/grid-proxy/internal/explorer/db/indexer_calls.go @@ -40,7 +40,7 @@ func (p *PostgresDatabase) UpsertNodesGPU(ctx context.Context, gpus []types.Node func (p *PostgresDatabase) UpsertNodeHealth(ctx context.Context, healthReports []types.HealthReport) error { conflictClause := clause.OnConflict{ Columns: []clause.Column{{Name: "node_twin_id"}}, - DoUpdates: clause.AssignmentColumns([]string{"healthy"}), + DoUpdates: clause.AssignmentColumns([]string{"healthy", "updated_at"}), } return p.gormDB.WithContext(ctx).Table("health_report").Clauses(conflictClause).Create(&healthReports).Error } @@ -48,7 +48,7 @@ func (p *PostgresDatabase) UpsertNodeHealth(ctx context.Context, healthReports [ func (p *PostgresDatabase) UpsertNodeDmi(ctx context.Context, dmis []types.Dmi) error { conflictClause := clause.OnConflict{ Columns: []clause.Column{{Name: "node_twin_id"}}, - DoUpdates: clause.AssignmentColumns([]string{"bios", "baseboard", "processor", "memory"}), + DoUpdates: clause.AssignmentColumns([]string{"bios", "baseboard", "processor", "memory", "updated_at"}), } return p.gormDB.WithContext(ctx).Table("dmi").Clauses(conflictClause).Create(&dmis).Error } @@ -56,7 +56,7 @@ func (p *PostgresDatabase) UpsertNodeDmi(ctx context.Context, dmis []types.Dmi) func (p *PostgresDatabase) UpsertNetworkSpeed(ctx context.Context, speeds []types.Speed) error { conflictClause := clause.OnConflict{ Columns: []clause.Column{{Name: "node_twin_id"}}, - DoUpdates: clause.AssignmentColumns([]string{"download", "upload"}), + DoUpdates: clause.AssignmentColumns([]string{"download", "upload", "updated_at"}), } return p.gormDB.WithContext(ctx).Table("speed").Clauses(conflictClause).Create(&speeds).Error } @@ -64,7 +64,7 @@ func (p *PostgresDatabase) UpsertNetworkSpeed(ctx context.Context, speeds []type func (p *PostgresDatabase) UpsertNodeIpv6Report(ctx context.Context, ips []types.HasIpv6) error { onConflictClause := clause.OnConflict{ Columns: []clause.Column{{Name: "node_twin_id"}}, - DoUpdates: clause.AssignmentColumns([]string{"has_ipv6"}), + DoUpdates: clause.AssignmentColumns([]string{"has_ipv6", "updated_at"}), } return p.gormDB.WithContext(ctx).Table("node_ipv6").Clauses(onConflictClause).Create(&ips).Error } @@ -72,7 +72,7 @@ func (p *PostgresDatabase) UpsertNodeIpv6Report(ctx context.Context, ips []types func (p *PostgresDatabase) UpsertNodeWorkloads(ctx context.Context, workloads []types.NodesWorkloads) error { conflictClause := clause.OnConflict{ Columns: []clause.Column{{Name: "node_twin_id"}}, - DoUpdates: clause.AssignmentColumns([]string{"workloads_number"}), + DoUpdates: clause.AssignmentColumns([]string{"workloads_number", "updated_at"}), } return p.gormDB.WithContext(ctx).Table("node_workloads").Clauses(conflictClause).Create(&workloads).Error } diff --git a/grid-proxy/internal/explorer/db/postgres.go b/grid-proxy/internal/explorer/db/postgres.go index 35401f65b..7b837bbd6 100644 --- a/grid-proxy/internal/explorer/db/postgres.go +++ b/grid-proxy/internal/explorer/db/postgres.go @@ -112,6 +112,29 @@ func (d *PostgresDatabase) Initialized() error { return nil } +func (d *PostgresDatabase) GetLastUpsertsTimestamp() (types.IndexersState, error) { + var report types.IndexersState + if res := d.gormDB.Table("node_gpu").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Gpu.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get node_gpu last updated_at") + } + if res := d.gormDB.Table("health_report").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Health.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get health_report last updated_at") + } + if res := d.gormDB.Table("node_ipv6").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Ipv6.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get node_ipv6 last updated_at") + } + if res := d.gormDB.Table("speed").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Speed.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get speed last updated_at") + } + if res := d.gormDB.Table("dmi").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Dmi.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get dmi last updated_at") + } + if res := d.gormDB.Table("node_workloads").Select("updated_at").Where("updated_at IS NOT NULL").Order("updated_at DESC").Limit(1).Scan(&report.Workloads.UpdatedAt); res.Error != nil { + return report, errors.Wrap(res.Error, "couldn't get workloads last updated_at") + } + return report, nil +} + func (d *PostgresDatabase) Initialize() error { err := d.gormDB.AutoMigrate( &types.NodeGPU{}, diff --git a/grid-proxy/internal/explorer/db/types.go b/grid-proxy/internal/explorer/db/types.go index 19ed50f1a..f63997a70 100644 --- a/grid-proxy/internal/explorer/db/types.go +++ b/grid-proxy/internal/explorer/db/types.go @@ -12,6 +12,7 @@ type Database interface { Ping() error Initialized() error GetRandomHealthyTwinIds(length int) ([]uint32, error) + GetLastUpsertsTimestamp() (types.IndexersState, error) // server getters GetStats(ctx context.Context, filter types.StatsFilter) (types.Stats, error) diff --git a/grid-proxy/internal/explorer/models.go b/grid-proxy/internal/explorer/models.go index 3da046b44..77abfeaad 100644 --- a/grid-proxy/internal/explorer/models.go +++ b/grid-proxy/internal/explorer/models.go @@ -26,6 +26,7 @@ type App struct { cl DBClient releaseVersion string relayClient rmb.Client + idxIntervals map[string]uint } type ErrorMessage struct { diff --git a/grid-proxy/internal/explorer/server.go b/grid-proxy/internal/explorer/server.go index 03f64c8bc..541c38757 100644 --- a/grid-proxy/internal/explorer/server.go +++ b/grid-proxy/internal/explorer/server.go @@ -415,6 +415,7 @@ func (a *App) healthiness(r *http.Request) (interface{}, mw.Response) { return createReport( a.cl, a.relayClient, + a.idxIntervals, ), response } @@ -548,12 +549,13 @@ func (a *App) getContractBills(r *http.Request) (interface{}, mw.Response) { // @license.name Apache 2.0 // @license.url http://www.apache.org/licenses/LICENSE-2.0.html // @BasePath / -func Setup(router *mux.Router, gitCommit string, cl DBClient, relayClient rmb.Client) error { +func Setup(router *mux.Router, gitCommit string, cl DBClient, relayClient rmb.Client, idxIntervals map[string]uint) error { a := App{ cl: cl, releaseVersion: gitCommit, relayClient: relayClient, + idxIntervals: idxIntervals, } router.HandleFunc("/farms", mw.AsHandlerFunc(a.listFarms)) diff --git a/grid-proxy/internal/explorer/state.go b/grid-proxy/internal/explorer/state.go index 6fc65396c..b727f6a38 100644 --- a/grid-proxy/internal/explorer/state.go +++ b/grid-proxy/internal/explorer/state.go @@ -6,6 +6,7 @@ import ( "sync" "time" + "github.com/rs/zerolog/log" "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" "github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go" ) @@ -14,7 +15,7 @@ const ( OkState = "ok" ) -func createReport(db DBClient, peer rmb.Client) types.Healthiness { +func createReport(db DBClient, peer rmb.Client, idxIntervals map[string]uint) types.Healthiness { var report types.Healthiness // db connection @@ -32,15 +33,37 @@ func createReport(db DBClient, peer rmb.Client) types.Healthiness { report.RMBConn = err.Error() } + // indexers + indexers, err := db.DB.GetLastUpsertsTimestamp() + if err != nil { + log.Error().Err(err).Msg("failed to get last upsert timestamp") + } + report.Indexers = indexers + // total report.TotalStateOk = true if report.DBConn != OkState || report.RMBConn != OkState { report.TotalStateOk = false } + + if isIndexerStale(indexers.Dmi.UpdatedAt, idxIntervals["dmi"]) || + isIndexerStale(indexers.Gpu.UpdatedAt, idxIntervals["gpu"]) || + isIndexerStale(indexers.Health.UpdatedAt, idxIntervals["health"]) || + isIndexerStale(indexers.Ipv6.UpdatedAt, idxIntervals["ipv6"]) || + isIndexerStale(indexers.Speed.UpdatedAt, idxIntervals["speed"]) || + isIndexerStale(indexers.Workloads.UpdatedAt, idxIntervals["workloads"]) { + report.TotalStateOk = false + } + return report } +func isIndexerStale(updatedAt int64, interval uint) bool { + updatedAtInTime := time.Unix(updatedAt, 0) + return time.Now().Sub(updatedAtInTime) > time.Duration(interval)*time.Minute +} + func pingRandomTwins(db DBClient, peer rmb.Client) error { twinIds, err := db.DB.GetRandomHealthyTwinIds(10) if err != nil { diff --git a/grid-proxy/internal/indexer/dmi.go b/grid-proxy/internal/indexer/dmi.go index b7e998b0d..c6e9a4cb9 100644 --- a/grid-proxy/internal/indexer/dmi.go +++ b/grid-proxy/internal/indexer/dmi.go @@ -92,5 +92,6 @@ func parseDmiResponse(dmiResponse zosDmiTypes.DMI, twinId uint32) types.Dmi { } info.NodeTwinId = twinId + info.UpdatedAt = time.Now().Unix() return info } diff --git a/grid-proxy/internal/indexer/health.go b/grid-proxy/internal/indexer/health.go index d6b833eca..cf891e7ba 100644 --- a/grid-proxy/internal/indexer/health.go +++ b/grid-proxy/internal/indexer/health.go @@ -47,6 +47,7 @@ func getHealthReport(response interface{}, err error, twinId uint32) types.Healt report := types.HealthReport{ NodeTwinId: twinId, Healthy: false, + UpdatedAt: time.Now().Unix(), } if err != nil { diff --git a/grid-proxy/internal/indexer/ipv6.go b/grid-proxy/internal/indexer/ipv6.go index 0020fa183..b2abba6ff 100644 --- a/grid-proxy/internal/indexer/ipv6.go +++ b/grid-proxy/internal/indexer/ipv6.go @@ -39,6 +39,7 @@ func (w *Ipv6Work) Get(ctx context.Context, rmb *peer.RpcClient, id uint32) ([]t { NodeTwinId: id, HasIpv6: has_ipv6, + UpdatedAt: time.Now().Unix(), }, }, nil } diff --git a/grid-proxy/internal/indexer/speed.go b/grid-proxy/internal/indexer/speed.go index a29fec64c..eed2f8504 100644 --- a/grid-proxy/internal/indexer/speed.go +++ b/grid-proxy/internal/indexer/speed.go @@ -83,5 +83,7 @@ func parseSpeed(res zosPerfPkg.TaskResult, twinId uint32) (types.Speed, error) { } } + speed.UpdatedAt = time.Now().Unix() + return speed, nil } diff --git a/grid-proxy/internal/indexer/workload.go b/grid-proxy/internal/indexer/workload.go index 3db7d8d9a..d1a3f4bdd 100644 --- a/grid-proxy/internal/indexer/workload.go +++ b/grid-proxy/internal/indexer/workload.go @@ -44,6 +44,7 @@ func (w *WorkloadWork) Get(ctx context.Context, rmb *peer.RpcClient, twinId uint { NodeTwinId: twinId, WorkloadsNumber: response.Users.Workloads, + UpdatedAt: time.Now().Unix(), }, }, nil } diff --git a/grid-proxy/pkg/types/indexer.go b/grid-proxy/pkg/types/indexer.go index b8a920634..cc30490e4 100644 --- a/grid-proxy/pkg/types/indexer.go +++ b/grid-proxy/pkg/types/indexer.go @@ -20,6 +20,7 @@ func (NodeGPU) TableName() string { type HealthReport struct { NodeTwinId uint32 `gorm:"unique;not null"` Healthy bool + UpdatedAt int64 } func (HealthReport) TableName() string { @@ -31,6 +32,7 @@ func (HealthReport) TableName() string { type HasIpv6 struct { NodeTwinId uint32 `gorm:"unique;not null"` HasIpv6 bool + UpdatedAt int64 } func (HasIpv6) TableName() string { @@ -43,6 +45,7 @@ type Speed struct { NodeTwinId uint32 `json:"node_twin_id,omitempty" gorm:"unique;not null"` Upload float64 `json:"upload"` // in bit/sec Download float64 `json:"download"` // in bit/sec + UpdatedAt int64 } func (Speed) TableName() string { @@ -53,6 +56,7 @@ func (Speed) TableName() string { type NodesWorkloads struct { NodeTwinId uint32 `json:"node_twin_id,omitempty" gorm:"unique;not null"` WorkloadsNumber uint32 `json:"workloads_number"` + UpdatedAt int64 } func (NodesWorkloads) TableName() string { @@ -67,6 +71,7 @@ type Dmi struct { Baseboard Baseboard `json:"baseboard" gorm:"type:jsonb;serializer:json"` Processor []Processor `json:"processor" gorm:"type:jsonb;serializer:json"` Memory []Memory `json:"memory" gorm:"type:jsonb;serializer:json"` + UpdatedAt int64 } func (Dmi) TableName() string { diff --git a/grid-proxy/pkg/types/version.go b/grid-proxy/pkg/types/version.go index 96b92cd8d..a8e71543a 100644 --- a/grid-proxy/pkg/types/version.go +++ b/grid-proxy/pkg/types/version.go @@ -5,9 +5,23 @@ type Version struct { Version string `json:"version"` } +type IndexerState struct { + UpdatedAt int64 `json:"updated_at"` +} + +type IndexersState struct { + Gpu IndexerState `json:"gpu"` + Health IndexerState `json:"health"` + Dmi IndexerState `json:"dmi"` + Speed IndexerState `json:"speed"` + Ipv6 IndexerState `json:"ipv6"` + Workloads IndexerState `json:"workloads"` +} + // Healthiness represent the healthiness of the server and connections type Healthiness struct { TotalStateOk bool `json:"total_state_ok"` DBConn string `json:"db_conn"` RMBConn string `json:"rmb_conn"` + Indexers IndexersState } diff --git a/grid-proxy/tools/db/schema.sql b/grid-proxy/tools/db/schema.sql index 1a49ec0a0..d8c120cd6 100644 --- a/grid-proxy/tools/db/schema.sql +++ b/grid-proxy/tools/db/schema.sql @@ -1047,6 +1047,7 @@ ALTER TABLE ONLY public.node_gpu CREATE TABLE IF NOT EXISTS public.health_report ( node_twin_id bigint NOT NULL, healthy boolean + updated_at bigint ); ALTER TABLE public.health_report @@ -1063,6 +1064,7 @@ CREATE TABLE public.dmi( baseboard jsonb, processor jsonb, memory jsonb + updated_at bigint ); ALTER TABLE public.dmi @@ -1076,6 +1078,7 @@ CREATE TABLE public.speed( node_twin_id bigint NOT NULL, upload numeric, download numeric + updated_at bigint ); ALTER TABLE public.speed @@ -1088,6 +1091,7 @@ ALTER TABLE public.speed CREATE TABLE IF NOT EXISTS public.node_ipv6 ( node_twin_id bigint NOT NULL, has_ipv6 boolean + updated_at bigint ); ALTER TABLE public.node_ipv6 @@ -1101,6 +1105,7 @@ ALTER TABLE public.node_ipv6 CREATE TABLE IF NOT EXISTS public.node_workloads ( node_twin_id bigint NOT NULL, workloads_number numeric + updated_at bigint ); ALTER TABLE public.node_workloads From c2925502d977e1e0bd4bfdaf2caf85cd86723353 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 00:29:53 +0300 Subject: [PATCH 05/10] fix workload indexer interval/workers --- grid-proxy/cmds/proxy_server/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grid-proxy/cmds/proxy_server/main.go b/grid-proxy/cmds/proxy_server/main.go index eebe872e6..cade989a3 100644 --- a/grid-proxy/cmds/proxy_server/main.go +++ b/grid-proxy/cmds/proxy_server/main.go @@ -217,11 +217,11 @@ func startIndexers(ctx context.Context, f flags, db db.Database, rpcRmbClient *p ipv6Idx.Start(ctx) wlNumIdx := indexer.NewIndexer[types.NodesWorkloads]( - indexer.NewWorkloadWork(f.ipv6IndexerIntervalMins), + indexer.NewWorkloadWork(f.workloadsIndexerIntervalMins), "workloads", db, rpcRmbClient, - f.ipv6IndexerNumWorkers, + f.workloadsIndexerNumWorkers, ) wlNumIdx.Start(ctx) } From 85e4e611fda667c1bb07622b7039f8d517e8c990 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 00:36:05 +0300 Subject: [PATCH 06/10] rename endpoint to health --- grid-proxy/internal/explorer/server.go | 4 ++-- grid-proxy/internal/explorer/state.go | 4 ++-- grid-proxy/pkg/types/version.go | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/grid-proxy/internal/explorer/server.go b/grid-proxy/internal/explorer/server.go index 541c38757..3c594df07 100644 --- a/grid-proxy/internal/explorer/server.go +++ b/grid-proxy/internal/explorer/server.go @@ -410,7 +410,7 @@ func (a *App) version(r *http.Request) (interface{}, mw.Response) { }, response } -func (a *App) healthiness(r *http.Request) (interface{}, mw.Response) { +func (a *App) health(r *http.Request) (interface{}, mw.Response) { response := mw.Ok() return createReport( a.cl, @@ -579,7 +579,7 @@ func Setup(router *mux.Router, gitCommit string, cl DBClient, relayClient rmb.Cl router.HandleFunc("/", mw.AsHandlerFunc(a.indexPage(router))) router.HandleFunc("/ping", mw.AsHandlerFunc(a.ping)) router.HandleFunc("/version", mw.AsHandlerFunc(a.version)) - router.HandleFunc("/healthiness", mw.AsHandlerFunc(a.healthiness)) + router.HandleFunc("/health", mw.AsHandlerFunc(a.health)) router.PathPrefix("/swagger/").Handler(httpSwagger.WrapHandler) return nil diff --git a/grid-proxy/internal/explorer/state.go b/grid-proxy/internal/explorer/state.go index b727f6a38..6810957f0 100644 --- a/grid-proxy/internal/explorer/state.go +++ b/grid-proxy/internal/explorer/state.go @@ -15,8 +15,8 @@ const ( OkState = "ok" ) -func createReport(db DBClient, peer rmb.Client, idxIntervals map[string]uint) types.Healthiness { - var report types.Healthiness +func createReport(db DBClient, peer rmb.Client, idxIntervals map[string]uint) types.Health { + var report types.Health // db connection report.DBConn = OkState diff --git a/grid-proxy/pkg/types/version.go b/grid-proxy/pkg/types/version.go index a8e71543a..e1636168c 100644 --- a/grid-proxy/pkg/types/version.go +++ b/grid-proxy/pkg/types/version.go @@ -18,10 +18,10 @@ type IndexersState struct { Workloads IndexerState `json:"workloads"` } -// Healthiness represent the healthiness of the server and connections -type Healthiness struct { - TotalStateOk bool `json:"total_state_ok"` - DBConn string `json:"db_conn"` - RMBConn string `json:"rmb_conn"` - Indexers IndexersState +// Health represent the healthiness of the server and connections +type Health struct { + TotalStateOk bool `json:"total_state_ok"` + DBConn string `json:"db_conn"` + RMBConn string `json:"rmb_conn"` + Indexers IndexersState `json:"indexers"` } From efd451ac87d0e2437be1dd199d1878ba6a80eb0e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 00:36:37 +0300 Subject: [PATCH 07/10] rename file to health --- grid-proxy/internal/explorer/{state.go => health.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename grid-proxy/internal/explorer/{state.go => health.go} (100%) diff --git a/grid-proxy/internal/explorer/state.go b/grid-proxy/internal/explorer/health.go similarity index 100% rename from grid-proxy/internal/explorer/state.go rename to grid-proxy/internal/explorer/health.go From df78afe0df467dced4aa42cc444af0cc3aa4b68e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 12:32:01 +0300 Subject: [PATCH 08/10] add updated_at field in generator and loader for tests --- grid-proxy/tests/queries/mock_client/loader.go | 17 +++++++++++++---- grid-proxy/tools/db/crafter/generator.go | 5 +++++ grid-proxy/tools/db/schema.sql | 10 +++++----- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/grid-proxy/tests/queries/mock_client/loader.go b/grid-proxy/tests/queries/mock_client/loader.go index 2840a3f08..515890838 100644 --- a/grid-proxy/tests/queries/mock_client/loader.go +++ b/grid-proxy/tests/queries/mock_client/loader.go @@ -550,7 +550,8 @@ func loadHealthReports(db *sql.DB, data *DBData) error { rows, err := db.Query(` SELECT COALESCE(node_twin_id, 0), - COALESCE(healthy, false) + COALESCE(healthy, false), + COALESCE(updated_at, 0) FROM health_report;`) if err != nil { @@ -561,6 +562,7 @@ func loadHealthReports(db *sql.DB, data *DBData) error { if err := rows.Scan( &health.NodeTwinId, &health.Healthy, + &health.UpdatedAt, ); err != nil { return err } @@ -574,7 +576,8 @@ func loadNodeIpv6(db *sql.DB, data *DBData) error { rows, err := db.Query(` SELECT COALESCE(node_twin_id, 0), - COALESCE(has_ipv6, false) + COALESCE(has_ipv6, false), + COALESCE(updated_at, 0) FROM node_ipv6;`) if err != nil { @@ -585,6 +588,7 @@ func loadNodeIpv6(db *sql.DB, data *DBData) error { if err := rows.Scan( &node.NodeTwinId, &node.HasIpv6, + &node.UpdatedAt, ); err != nil { return err } @@ -603,6 +607,7 @@ func loadDMIs(db *sql.DB, gormDB *gorm.DB, data *DBData) error { for _, dmi := range dmis { twinId := dmi.NodeTwinId dmi.NodeTwinId = 0 // to omit it as empty, cleaner response + dmi.UpdatedAt = 0 data.DMIs[twinId] = dmi } @@ -614,7 +619,8 @@ func loadSpeeds(db *sql.DB, data *DBData) error { SELECT node_twin_id, upload, - download + download, + updated_at FROM speed;`) if err != nil { @@ -626,6 +632,7 @@ func loadSpeeds(db *sql.DB, data *DBData) error { &speed.NodeTwinId, &speed.Upload, &speed.Download, + &speed.UpdatedAt, ); err != nil { return err } @@ -638,7 +645,8 @@ func loadWorkloadsNumber(db *sql.DB, data *DBData) error { rows, err := db.Query(` SELECT node_twin_id, - workloads_number + workloads_number, + updated_at FROM node_workloads; `) @@ -652,6 +660,7 @@ func loadWorkloadsNumber(db *sql.DB, data *DBData) error { if err := rows.Scan( &wl.NodeTwinId, &wl.WorkloadsNumber, + &wl.UpdatedAt, ); err != nil { return err } diff --git a/grid-proxy/tools/db/crafter/generator.go b/grid-proxy/tools/db/crafter/generator.go index 3fae20bfb..39312eae6 100644 --- a/grid-proxy/tools/db/crafter/generator.go +++ b/grid-proxy/tools/db/crafter/generator.go @@ -852,6 +852,7 @@ func (c *Crafter) GenerateSpeedReports() error { NodeTwinId: uint32(nodeTwinsStart + i), Upload: rand.Float64() * float64(rand.Intn(9999999)), Download: rand.Float64() * float64(rand.Intn(9999999)), + UpdatedAt: time.Now().Unix(), } speedReports = append(speedReports, speedReport) } @@ -877,6 +878,7 @@ func (c *Crafter) GenerateDmi() error { Baseboard: baseboard[rand.Intn(len(baseboard))], Processor: processor[:rand.Intn(len(processor))], Memory: memory[:rand.Intn(len(memory))], + UpdatedAt: time.Now().Unix(), } dmis = append(dmis, dmi) } @@ -904,6 +906,7 @@ func (c *Crafter) GenerateHealthReports() error { healthReport := types.HealthReport{ NodeTwinId: uint32(nodeTwinsStart + i), Healthy: health, + UpdatedAt: time.Now().Unix(), } healthReports = append(healthReports, healthReport) } @@ -953,6 +956,7 @@ func (c *Crafter) GenerateNodeIpv6() error { report := types.HasIpv6{ NodeTwinId: uint32(nodeTwinsStart + i), HasIpv6: has_ipv6, + UpdatedAt: time.Now().Unix(), } reports = append(reports, report) } @@ -975,6 +979,7 @@ func (c *Crafter) GenerateNodeWorkloads() error { report := types.NodesWorkloads{ NodeTwinId: uint32(nodeTwinsStart + i), WorkloadsNumber: uint32(rand.Intn(120)), + UpdatedAt: time.Now().Unix(), } reports = append(reports, report) } diff --git a/grid-proxy/tools/db/schema.sql b/grid-proxy/tools/db/schema.sql index d8c120cd6..c7db15b00 100644 --- a/grid-proxy/tools/db/schema.sql +++ b/grid-proxy/tools/db/schema.sql @@ -1046,7 +1046,7 @@ ALTER TABLE ONLY public.node_gpu CREATE TABLE IF NOT EXISTS public.health_report ( node_twin_id bigint NOT NULL, - healthy boolean + healthy boolean, updated_at bigint ); @@ -1063,7 +1063,7 @@ CREATE TABLE public.dmi( bios jsonb, baseboard jsonb, processor jsonb, - memory jsonb + memory jsonb, updated_at bigint ); @@ -1077,7 +1077,7 @@ ALTER TABLE public.dmi CREATE TABLE public.speed( node_twin_id bigint NOT NULL, upload numeric, - download numeric + download numeric, updated_at bigint ); @@ -1090,7 +1090,7 @@ ALTER TABLE public.speed CREATE TABLE IF NOT EXISTS public.node_ipv6 ( node_twin_id bigint NOT NULL, - has_ipv6 boolean + has_ipv6 boolean, updated_at bigint ); @@ -1104,7 +1104,7 @@ ALTER TABLE public.node_ipv6 CREATE TABLE IF NOT EXISTS public.node_workloads ( node_twin_id bigint NOT NULL, - workloads_number numeric + workloads_number numeric, updated_at bigint ); From babcaf261fa29fbc8da21b7d075bf41c59147b7e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Wed, 29 May 2024 12:37:40 +0300 Subject: [PATCH 09/10] use time.Since() --- grid-proxy/internal/explorer/health.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grid-proxy/internal/explorer/health.go b/grid-proxy/internal/explorer/health.go index 6810957f0..efa4bd75c 100644 --- a/grid-proxy/internal/explorer/health.go +++ b/grid-proxy/internal/explorer/health.go @@ -61,7 +61,7 @@ func createReport(db DBClient, peer rmb.Client, idxIntervals map[string]uint) ty func isIndexerStale(updatedAt int64, interval uint) bool { updatedAtInTime := time.Unix(updatedAt, 0) - return time.Now().Sub(updatedAtInTime) > time.Duration(interval)*time.Minute + return time.Since(updatedAtInTime) > time.Duration(interval)*time.Minute } func pingRandomTwins(db DBClient, peer rmb.Client) error { From 8f85c7beab8f3147c92e45bcc63e846fd25df73e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 9 Jun 2024 13:04:07 +0300 Subject: [PATCH 10/10] update proxy changelog/chart for v0.15.8 --- grid-proxy/CHANGELOG.md | 6 ++++++ grid-proxy/charts/gridproxy/Chart.yaml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/grid-proxy/CHANGELOG.md b/grid-proxy/CHANGELOG.md index 22a19505e..db15df589 100644 --- a/grid-proxy/CHANGELOG.md +++ b/grid-proxy/CHANGELOG.md @@ -12,6 +12,12 @@ Check `/version` on any instance to know the version. ## Releases +### v0.15.8 + +--- + +- `feat` add `/health` endpoint + ### v0.15.7 --- diff --git a/grid-proxy/charts/gridproxy/Chart.yaml b/grid-proxy/charts/gridproxy/Chart.yaml index bc45f7fda..ed974f30b 100644 --- a/grid-proxy/charts/gridproxy/Chart.yaml +++ b/grid-proxy/charts/gridproxy/Chart.yaml @@ -20,6 +20,6 @@ version: 1.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. -appVersion: 0.15.7 +appVersion: 0.15.8 # make sure to update the changelog with the changes in this release