diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..0a4abb5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,45 @@ +name: Go Tests + +on: + push: + branches: [ main, master ] + paths: + - '**.go' + - 'go.mod' + - 'go.sum' + pull_request: + branches: [ main, master ] + paths: + - '**.go' + - 'go.mod' + - 'go.sum' + +jobs: + test: + name: Run Tests + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + check-latest: true + + - name: Get dependencies + run: go mod tidy + + - name: Run tests + run: go test -v -coverprofile=coverage.out ./... + + - name: Display coverage + run: go tool cover -func=coverage.out + + - name: Upload coverage report + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: coverage.out + if-no-files-found: error \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2eea525..fef0321 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.env \ No newline at end of file +.env +coverage.out \ No newline at end of file diff --git a/README.md b/README.md index 756f31c..e67a876 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,71 @@ -# System Monitoring +# Appwrite System Monitoring -A lightweight system monitoring tool that tracks CPU, memory, and disk usage across your infrastructure. When resource usage exceeds defined thresholds, it creates incidents in BetterStack. +A system monitoring tool for Appwrite servers that tracks CPU, memory, and disk usage with alerting via BetterStack. 
## Features -- CPU usage monitoring -- Memory usage monitoring -- Disk usage monitoring (root and mounted volumes) -- Automatic incident creation and resolution -- Configurable thresholds via CLI -- Docker-based deployment +- Monitors CPU usage with configurable thresholds +- Monitors memory usage with configurable thresholds +- Monitors disk usage (root and mounted volumes) with configurable thresholds +- Uses Exponential Moving Average (EMA) to smooth out short-term spikes +- Sends alerts to BetterStack when thresholds are exceeded -## Command Line Usage +## Project Structure -The monitoring tool is configured through command-line flags: +The project follows a simplified package structure: -```bash -monitoring [flags] - -Flags: - -url string - BetterStack webhook URL (required) - -interval int - Check interval in seconds (default: 300) - -cpu-limit float - CPU usage threshold percentage (default: 90) - -memory-limit float - Memory usage threshold percentage (default: 90) - -disk-limit float - Disk usage threshold percentage (default: 85) - -help - Display help information -``` - -### Examples - -```bash -# Basic usage with required URL -monitoring --url=https://betterstack.com/webhook/xyz - -# Custom thresholds -monitoring --url=https://betterstack.com/webhook/xyz \ - --cpu-limit=95 \ - --memory-limit=85 \ - --disk-limit=80 - -# More frequent checks (every minute) -monitoring --url=https://betterstack.com/webhook/xyz --interval=60 ``` - -## Docker Deployment - -### Using Docker Run - -```bash -docker run -d \ - --name monitoring \ - --privileged \ - --pid=host \ - -v /:/host:ro \ - ghcr.io/appwrite/monitoring:latest \ - monitoring \ - --url=https://betterstack.com/webhook/xyz \ - --interval=300 \ - --cpu-limit=90 \ - --memory-limit=90 \ - --disk-limit=85 -``` - -### Using Docker Compose - -The docker-compose.yml file is configured with default parameters that you can modify as needed: - -```bash -docker-compose up -d +monitoring/ +├── main.go # Entry point with 
CLI argument parsing +├── pkg/ +│ ├── logger.go # Logging functionality at package level +│ └── monitor/ # Core monitoring functionality +│ ├── monitor.go # Main monitoring struct and Metric model +│ ├── cpu.go # CPU-specific monitoring +│ ├── memory.go # Memory-specific monitoring +│ └── disk.go # Disk-specific monitoring +├── go.mod # Go module definition +└── README.md # Documentation ``` -To modify the parameters, edit the command section in docker-compose.yml: -```yaml -command: - - monitoring - - "--url=https://betterstack.com/webhook/xyz" - - "--interval=10" - - "--cpu-limit=90" - - "--memory-limit=80" - - "--disk-limit=85" -``` +## Usage -## Building from Source +### Building -1. Clone the repository: ```bash +# Clone the repository git clone https://github.com/appwrite/monitoring.git cd monitoring -``` - -2. Build the binary: -```bash -go build -o monitoring -``` -3. Run the monitoring tool: -```bash -monitoring --url=https://betterstack.com/webhook/xyz +# Build the binary +go build -o appwrite-monitor main.go ``` -## Development - -### Requirements -- Go 1.21 or later -- Docker and Docker Compose (for containerized deployment) +### Running -### Local Development -1. Install dependencies: ```bash -go mod download +# Basic usage +./appwrite-monitor --url="https://betterstack-webhook-url" + +# With custom thresholds +./appwrite-monitor \ + --url="https://betterstack-webhook-url" \ + --interval=60 \ + --cpu-limit=80 \ + --memory-limit=85 \ + --disk-limit=90 ``` -2. 
Build and run: -```bash -go build -o monitoring -monitoring --url=https://betterstack.com/webhook/xyz -``` +### Command Line Options -### Docker Development -``` -docker compose up -d -``` +- `--url`: BetterStack webhook URL (required) +- `--interval`: Check interval in seconds (default: 300) +- `--cpu-limit`: CPU usage threshold percentage (default: 90) +- `--memory-limit`: Memory usage threshold percentage (default: 90) +- `--disk-limit`: Disk usage threshold percentage (default: 85) + +## How It Works -## License +The monitoring tool uses an Exponential Moving Average (EMA) to track resource usage over time, which helps prevent false alerts from momentary spikes. When the EMA of a resource exceeds the configured threshold, an alert is sent to BetterStack. -MIT License - see the [LICENSE](LICENSE) file for details +The EMA smoothing factor is automatically calculated based on the check interval to provide roughly 5 minutes of smoothing: alpha = 2 / (N + 1), where N = 300 / interval is the number of checks in 5 minutes (for example, a 60-second interval gives alpha ≈ 0.333). This means that sudden spikes will have less impact on the reported values, while sustained high usage will still trigger alerts. 
diff --git a/go.mod b/go.mod index f881372..2e14ffd 100644 --- a/go.mod +++ b/go.mod @@ -2,15 +2,15 @@ module github.com/appwrite/monitoring go 1.19 -require github.com/shirou/gopsutil/v3 v3.24.1 +require github.com/shirou/gopsutil/v3 v3.23.7 require ( github.com/go-ole/go-ole v1.2.6 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect - github.com/tklauser/go-sysconf v0.3.12 // indirect - github.com/tklauser/numcpus v0.6.1 // indirect + github.com/tklauser/go-sysconf v0.3.11 // indirect + github.com/tklauser/numcpus v0.6.0 // indirect github.com/yusufpapurcu/wmi v1.2.3 // indirect - golang.org/x/sys v0.16.0 // indirect + golang.org/x/sys v0.10.0 // indirect ) diff --git a/go.sum b/go.sum index ad5810c..c15f65b 100644 --- a/go.sum +++ b/go.sum @@ -4,17 +4,16 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= -github.com/shirou/gopsutil/v3 v3.24.1 h1:R3t6ondCEvmARp3wxODhXMTLC/klMa87h2PHUw5m7QI= -github.com/shirou/gopsutil/v3 v3.24.1/go.mod h1:UU7a2MSBQa+kW1uuDq8DeEBS8kmrnQwsv2b5O513rwU= +github.com/shirou/gopsutil/v3 v3.23.7 h1:C+fHO8hfIppoJ1WdsVm1RoI0RwXoNdfTK7yWXV0wVj4= +github.com/shirou/gopsutil/v3 v3.23.7/go.mod h1:c4gnmoRC0hQuaLqvxnx1//VXQ0Ms/X9UnJF8pddY5z4= github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= @@ -26,18 +25,17 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= -github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= -github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= -github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= +github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= +github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI= +github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= +github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4= 
github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw= github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= -golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index 8365faa..815d72a 100644 --- a/main.go +++ b/main.go @@ -1,269 +1,16 @@ package main import ( - "encoding/json" "flag" "fmt" - "net/http" "os" - "path/filepath" - "strings" - "time" - "github.com/shirou/gopsutil/v3/cpu" - "github.com/shirou/gopsutil/v3/disk" - "github.com/shirou/gopsutil/v3/mem" + "github.com/appwrite/monitoring/pkg" + "github.com/appwrite/monitoring/pkg/monitor" ) -type Metric struct { - Title string `json:"title"` - Cause string `json:"cause"` - AlertID string `json:"alert_id"` - Timestamp int64 `json:"timestamp"` - Status string `json:"status"` - Value float64 `json:"value"` - Limit float64 `json:"limit"` -} - -type SystemMonitor struct { - httpClient *http.Client - 
betterStackURL string - hostname string - cpuLimit float64 - memoryLimit float64 - diskLimit float64 - interval int - log *Logger -} - -func NewSystemMonitor(betterStackURL string, interval int, cpuLimit, memoryLimit, diskLimit float64) (*SystemMonitor, error) { - hostname, err := os.Hostname() - if err != nil { - return nil, fmt.Errorf("failed to get hostname: %v", err) - } - - return &SystemMonitor{ - httpClient: &http.Client{ - Timeout: 5 * time.Second, - }, - betterStackURL: betterStackURL, - hostname: hostname, - cpuLimit: cpuLimit, - memoryLimit: memoryLimit, - diskLimit: diskLimit, - interval: interval, - log: New(), - }, nil -} - -func (s *SystemMonitor) checkCPU() error { - duration := float64(s.interval) / 10 - if duration < 5 { - duration = 5 - } - if duration > 60 { - duration = 60 - } - - cpuPercent, err := cpu.Percent(time.Duration(duration)*time.Second, false) - if err != nil { - return fmt.Errorf("failed to get CPU usage: %v", err) - } - - if len(cpuPercent) == 0 { - return nil - } - - value := cpuPercent[0] - status := s.getStatus(value, s.cpuLimit) - if status == "fail" { - s.log.Warn("CPU usage %.2f%% exceeds limit of %.2f%%", value, s.cpuLimit) - } else { - s.log.Log("CPU usage: %.2f%% (limit: %.2f%%)", value, s.cpuLimit) - } - - metric := Metric{ - Title: fmt.Sprintf("CPU Usage - %s", s.hostname), - Cause: "CPU monitoring check", - AlertID: fmt.Sprintf("cpu-%s", s.hostname), - Timestamp: time.Now().Unix(), - Status: status, - Value: value, - Limit: s.cpuLimit, - } - - return s.sendMetric(metric) -} - -func (s *SystemMonitor) checkMemory() error { - vmStat, err := mem.VirtualMemory() - if err != nil { - return fmt.Errorf("failed to get memory stats: %v", err) - } - - value := vmStat.UsedPercent - status := s.getStatus(value, s.memoryLimit) - if status == "fail" { - s.log.Warn("Memory usage %.2f%% exceeds limit of %.2f%%", value, s.memoryLimit) - } else { - s.log.Log("Memory usage: %.2f%% (limit: %.2f%%), Available: %d MB, Total: %d MB", - value, 
- s.memoryLimit, - vmStat.Available/(1024*1024), - vmStat.Total/(1024*1024)) - } - - metric := Metric{ - Title: fmt.Sprintf("Memory Usage - %s", s.hostname), - Cause: "Memory monitoring check", - AlertID: fmt.Sprintf("memory-%s", s.hostname), - Timestamp: time.Now().Unix(), - Status: status, - Value: value, - Limit: s.memoryLimit, - } - - return s.sendMetric(metric) -} - -func (s *SystemMonitor) checkDisk() error { - // Check root partition - usage, err := disk.Usage("/") - if err != nil { - return fmt.Errorf("failed to get disk usage: %v", err) - } - - value := usage.UsedPercent - status := s.getStatus(value, s.diskLimit) - if status == "fail" { - s.log.Warn("Root disk usage %.2f%% exceeds limit of %.2f%%", value, s.diskLimit) - } else { - s.log.Log("Root disk usage: %.2f%% (limit: %.2f%%), Free: %d MB, Total: %d MB", - value, - s.diskLimit, - usage.Free/(1024*1024), - usage.Total/(1024*1024)) - } - - if err := s.sendMetric(Metric{ - Title: fmt.Sprintf("Root Disk Usage - %s", s.hostname), - Cause: "Disk monitoring check", - AlertID: fmt.Sprintf("disk-root-%s", s.hostname), - Timestamp: time.Now().Unix(), - Status: status, - Value: value, - Limit: s.diskLimit, - }); err != nil { - return err - } - - // Check mounted directories - mounts, err := filepath.Glob("/mnt/*") - if err != nil { - return fmt.Errorf("failed to list mounted directories: %v", err) - } - - for _, mount := range mounts { - usage, err := disk.Usage(mount) - if err != nil { - s.log.Error("Failed to get disk usage for %s: %v", mount, err) - continue - } - - value := usage.UsedPercent - status := s.getStatus(value, s.diskLimit) - if status == "fail" { - s.log.Warn("Disk usage for %s %.2f%% exceeds limit of %.2f%%", mount, value, s.diskLimit) - } else { - s.log.Log("Disk usage for %s: %.2f%% (limit: %.2f%%), Free: %d MB, Total: %d MB", - mount, - value, - s.diskLimit, - usage.Free/(1024*1024), - usage.Total/(1024*1024)) - } - - if err := s.sendMetric(Metric{ - Title: fmt.Sprintf("Disk Usage %s - %s", 
mount, s.hostname), - Cause: "Disk monitoring check", - AlertID: fmt.Sprintf("disk-%s-%s", filepath.Base(mount), s.hostname), - Timestamp: time.Now().Unix(), - Status: status, - Value: value, - Limit: s.diskLimit, - }); err != nil { - return err - } - } - - return nil -} - -func (s *SystemMonitor) getStatus(value, limit float64) string { - if value > limit { - return "fail" - } - return "pass" -} - -func (s *SystemMonitor) sendMetric(metric Metric) error { - body, err := json.Marshal(metric) - if err != nil { - return fmt.Errorf("failed to marshal metric: %v", err) - } - - req, err := http.NewRequest(http.MethodPost, s.betterStackURL, strings.NewReader(string(body))) - if err != nil { - return fmt.Errorf("failed to create request: %v", err) - } - - req.Header.Set("Content-Type", "application/json; charset=utf-8") - req.Header.Set("Accept", "application/json") - req.Header.Set("User-Agent", "Appwrite Resource Monitoring") - - resp, err := s.httpClient.Do(req) - if err != nil { - return fmt.Errorf("failed to send request: %v", err) - } - defer resp.Body.Close() - - s.log.Log("Response Status: %s", resp.Status) - if resp.StatusCode >= 400 { - return fmt.Errorf("request failed with status: %d", resp.StatusCode) - } - - return nil -} - -func (s *SystemMonitor) Start() { - ticker := time.NewTicker(time.Duration(s.interval) * time.Second) - defer ticker.Stop() - - // Initial check - s.runChecks() - - // Periodic checks - for range ticker.C { - s.runChecks() - } -} - -func (s *SystemMonitor) runChecks() { - if err := s.checkCPU(); err != nil { - s.log.Error("Error checking CPU: %v", err) - } - - if err := s.checkMemory(); err != nil { - s.log.Error("Error checking memory: %v", err) - } - - if err := s.checkDisk(); err != nil { - s.log.Error("Error checking disk: %v", err) - } -} - func main() { - log := New() + log := pkg.NewLogger() // Command line flags betterStackURL := flag.String("url", "", "BetterStack webhook URL (required)") @@ -300,7 +47,7 @@ func main() { 
log.Fatal("Disk limit must be between 0 and 100") } - monitor, err := NewSystemMonitor(*betterStackURL, *interval, *cpuLimit, *memoryLimit, *diskLimit) + monitor, err := monitor.NewSystemMonitor(*betterStackURL, *interval, *cpuLimit, *memoryLimit, *diskLimit) if err != nil { log.Fatal("Failed to create system monitor: %v", err) } diff --git a/main_test.go b/main_test.go new file mode 100644 index 0000000..f037629 --- /dev/null +++ b/main_test.go @@ -0,0 +1,266 @@ +package main + +import ( + "fmt" + "math" + "testing" +) + +func TestEMACalculation(t *testing.T) { + // For testing, we'll use our own EMA calculation to simulate the internal behavior + var emaValue float64 = 0.0 + alpha := 0.333 // This matches the calculation in monitor for 60-second interval + + // Helper function to calculate EMA to match monitor's internal calculation + calculateEMA := func(currentValue, previousEMA float64) float64 { + return alpha*currentValue + (1-alpha)*previousEMA + } + + // Test case 1: Steady high usage + t.Run("SteadyHighUsage", func(t *testing.T) { + // Simulate 10 measurements of 95% CPU usage + emaValue = 0.0 + for i := 0; i < 10; i++ { + emaValue = calculateEMA(95.0, emaValue) + } + + // EMA should be close to 95% after 10 measurements + if emaValue < 93.0 || emaValue > 96.0 { + t.Errorf("Expected EMA to be around 95%% after steady high usage, got %.2f%%", emaValue) + } + }) + + // Test case 2: Single spike + t.Run("SingleSpike", func(t *testing.T) { + // Reset EMA + emaValue = 50.0 + + // Single spike to 100% + emaValue = calculateEMA(100.0, emaValue) + + // EMA should be much lower than the spike + if emaValue > 70.0 { + t.Errorf("Expected EMA to be significantly lower than spike (100%%), got %.2f%%", emaValue) + } + }) + + // Test case 3: Gradual increase + t.Run("GradualIncrease", func(t *testing.T) { + // Reset EMA + emaValue = 50.0 + + // Simulate gradual increase from 50% to 90% + values := []float64{60.0, 70.0, 80.0, 90.0} + for _, v := range values { + emaValue 
= calculateEMA(v, emaValue) + } + + // EMA should be between 70% and 90% + if emaValue < 70.0 || emaValue > 90.0 { + t.Errorf("Expected EMA to be between 70%% and 90%% after gradual increase, got %.2f%%", emaValue) + } + }) +} + +func TestEMASmoothing(t *testing.T) { + // For testing, we'll use our own EMA calculation to simulate the internal behavior + var emaValue float64 = 50.0 + alpha := 0.333 // This matches the calculation in monitor for 60-second interval + + // Helper function to calculate EMA to match monitor's internal calculation + calculateEMA := func(currentValue, previousEMA float64) float64 { + return alpha*currentValue + (1-alpha)*previousEMA + } + + // Test case: Alternating high and low values + t.Run("AlternatingValues", func(t *testing.T) { + // Reset EMA + emaValue = 50.0 + + // Simulate alternating between 20% and 100% + for i := 0; i < 20; i++ { + var value float64 + if i%2 == 0 { + value = 20.0 + } else { + value = 100.0 + } + emaValue = calculateEMA(value, emaValue) + } + + // EMA should be around 60% (smoothing out the extremes) + if emaValue < 50.0 || emaValue > 70.0 { + t.Errorf("Expected EMA to be around 60%% after alternating values, got %.2f%%", emaValue) + } + }) +} + +func TestEMAResponseTime(t *testing.T) { + // For testing, we'll use our own EMA calculation to simulate the internal behavior + var emaValue float64 = 50.0 + alpha := 0.333 // This matches the calculation in monitor for 60-second interval + + // Helper function to calculate EMA to match monitor's internal calculation + calculateEMA := func(currentValue, previousEMA float64) float64 { + return alpha*currentValue + (1-alpha)*previousEMA + } + + // Test case: Measure how quickly EMA responds to sustained change + t.Run("ResponseTime", func(t *testing.T) { + // Reset EMA + emaValue = 50.0 + + // Simulate sustained high usage + measurements := 0 + for emaValue < 80.0 { + emaValue = calculateEMA(100.0, emaValue) + measurements++ + + // Prevent infinite loop + if 
measurements > 20 { + t.Fatal("EMA took too long to respond to sustained high usage") + } + } + + t.Logf("EMA reached 80%% after %d measurements", measurements) + }) +} + +func TestEMADifferentIntervals(t *testing.T) { + // Test different intervals to verify alpha calculation + testCases := []struct { + interval int + expectedAlpha float64 + }{ + {60, 0.333}, // 5 minutes = 5 periods + {30, 0.182}, // 5 minutes = 10 periods + {15, 0.095}, // 5 minutes = 20 periods + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("Interval%d", tc.interval), func(t *testing.T) { + // For each interval, we need to calculate what the alpha should be + N := float64(300) / float64(tc.interval) + alpha := 2.0 / (N + 1.0) + + // Allow for small floating point differences + if math.Abs(alpha - tc.expectedAlpha) > 0.001 { + t.Errorf("Expected alpha %.3f for interval %d, got %.3f", + tc.expectedAlpha, tc.interval, alpha) + } + }) + } +} + +func TestDiskEMAs(t *testing.T) { + // For testing, we'll use our own EMA calculation to simulate the internal behavior + alpha := 0.333 // This matches the calculation in monitor for 60-second interval + + // Helper function to calculate EMA to match monitor's internal calculation + calculateEMA := func(currentValue, previousEMA float64) float64 { + return alpha*currentValue + (1-alpha)*previousEMA + } + + // Test case 1: Verify root disk EMA updates correctly + t.Run("RootDiskEMA", func(t *testing.T) { + rootPath := "/" + diskEMAs := make(map[string]float64) + + // Simulate initial usage + diskEMAs[rootPath] = 50.0 + + // Simulate a series of measurements + values := []float64{60.0, 70.0, 80.0, 90.0} + for _, v := range values { + diskEMAs[rootPath] = calculateEMA(v, diskEMAs[rootPath]) + } + + // EMA should be between 70% and 90% + if diskEMAs[rootPath] < 70.0 || diskEMAs[rootPath] > 90.0 { + t.Errorf("Expected root disk EMA to be between 70%% and 90%%, got %.2f%%", diskEMAs[rootPath]) + } + }) + + // Test case 2: Verify multiple mount points 
are tracked independently + t.Run("MultipleMountEMAs", func(t *testing.T) { + diskEMAs := make(map[string]float64) + + // Setup test mounts with different starting values + mounts := map[string]float64{ + "/mnt/data1": 30.0, + "/mnt/data2": 50.0, + "/mnt/logs": 70.0, + } + + // Initialize starting values + for path, value := range mounts { + diskEMAs[path] = value + } + + // Apply the same change to all mounts (+20%) + for path := range mounts { + currentValue := diskEMAs[path] + newValue := currentValue + 20.0 + if newValue > 100.0 { + newValue = 100.0 + } + diskEMAs[path] = calculateEMA(newValue, currentValue) + } + + // Verify each mount's EMA updated independently + for path, initialValue := range mounts { + expectedMinimum := initialValue + expectedMaximum := initialValue + 20.0 // Full change would be +20% + if expectedMaximum > 100.0 { + expectedMaximum = 100.0 + } + + // With our alpha, the EMA should be approximately between the initial value and the initial + 7% (1/3 of 20%) + expectedMinimum = initialValue + expectedMaximum = initialValue + 7.0 + + actualValue := diskEMAs[path] + if actualValue < expectedMinimum || actualValue > expectedMaximum { + t.Errorf("Mount %s: Expected EMA between %.2f%% and %.2f%%, got %.2f%%", + path, expectedMinimum, expectedMaximum, actualValue) + } + } + + // Extract just the mount values we're testing + mountValues := make([]float64, 0, len(mounts)) + for path := range mounts { + mountValues = append(mountValues, diskEMAs[path]) + } + + // Verify mount points have different values (they're independent) + if len(unique(mountValues)) != len(mounts) { + t.Errorf("Expected all mount EMAs to be different, got: %v", mountValues) + } + }) +} + +// Helper functions for the disk EMA tests + +// Return values from a map as a slice +func getValues(m map[string]float64) []float64 { + values := make([]float64, 0, len(m)) + for _, v := range m { + values = append(values, v) + } + return values +} + +// Return unique values from a slice 
+func unique(values []float64) []float64 { + seen := make(map[float64]bool) + unique := make([]float64, 0) + + for _, v := range values { + if !seen[v] { + seen[v] = true + unique = append(unique, v) + } + } + + return unique +} \ No newline at end of file diff --git a/logger.go b/pkg/logger.go similarity index 97% rename from logger.go rename to pkg/logger.go index 45c20e4..1759df9 100644 --- a/logger.go +++ b/pkg/logger.go @@ -1,4 +1,4 @@ -package main +package pkg import ( "fmt" @@ -21,7 +21,7 @@ type Logger struct { logger *log.Logger } -func New() *Logger { +func NewLogger() *Logger { return &Logger{ logger: log.New(os.Stdout, "", 0), } diff --git a/pkg/monitor/cpu.go b/pkg/monitor/cpu.go new file mode 100644 index 0000000..9364279 --- /dev/null +++ b/pkg/monitor/cpu.go @@ -0,0 +1,51 @@ +package monitor + +import ( + "fmt" + "time" + + "github.com/shirou/gopsutil/v3/cpu" +) + +// CheckCPU monitors CPU usage and sends metrics +func (s *SystemMonitor) CheckCPU() error { + duration := float64(s.interval) / 10 + if duration < 5 { + duration = 5 + } + if duration > 60 { + duration = 60 + } + + cpuPercent, err := cpu.Percent(time.Duration(duration)*time.Second, false) + if err != nil { + return fmt.Errorf("failed to get CPU usage: %v", err) + } + + if len(cpuPercent) == 0 { + return nil + } + + // Calculate EMA for CPU usage + instantValue := cpuPercent[0] + s.cpuEMA = s.calculateEMA(instantValue, s.cpuEMA) + + status := s.getStatus(s.cpuEMA, s.cpuLimit) + if status == "fail" { + s.log.Warn("CPU usage EMA %.2f%% exceeds limit of %.2f%% (instant: %.2f%%)", s.cpuEMA, s.cpuLimit, instantValue) + } else { + s.log.Log("CPU usage EMA: %.2f%% (limit: %.2f%%, instant: %.2f%%)", s.cpuEMA, s.cpuLimit, instantValue) + } + + metric := Metric{ + Title: fmt.Sprintf("CPU Usage - %s", s.hostname), + Cause: "CPU monitoring check", + AlertID: fmt.Sprintf("cpu-%s", s.hostname), + Timestamp: time.Now().Unix(), + Status: status, + Value: s.cpuEMA, + Limit: s.cpuLimit, + } + + return 
s.sendMetric(metric) +} \ No newline at end of file
diff --git a/pkg/monitor/disk.go b/pkg/monitor/disk.go
new file mode 100644
index 0000000..bcfc306
--- /dev/null
+++ b/pkg/monitor/disk.go
@@ -0,0 +1,110 @@
+package monitor
+
+import (
+	"errors"
+	"fmt"
+	"path/filepath"
+	"time"
+
+	"github.com/shirou/gopsutil/v3/disk"
+)
+
+// CheckDisk monitors disk usage for the root partition and for every entry
+// matching /mnt/*, sending one metric per path.
+//
+// It returns early only when the root partition's usage cannot be read. After
+// that, every path is reported independently: a metric that fails to send is
+// recorded and the remaining paths are still checked, and all such failures
+// are joined into the returned error. A mount whose usage cannot be read is
+// logged and skipped.
+func (s *SystemMonitor) CheckDisk() error {
+	// A SystemMonitor built without NewSystemMonitor has a nil map, and
+	// writing to a nil map panics.
+	if s.diskEMAs == nil {
+		s.diskEMAs = make(map[string]float64)
+	}
+
+	// Check root partition
+	rootPath := "/"
+	usage, err := disk.Usage(rootPath)
+	if err != nil {
+		return fmt.Errorf("failed to get disk usage: %w", err)
+	}
+
+	var errs []error
+	if err := s.reportDisk(
+		rootPath,
+		usage,
+		"Root disk usage",
+		fmt.Sprintf("Root Disk Usage - %s", s.hostname),
+		fmt.Sprintf("disk-root-%s", s.hostname),
+	); err != nil {
+		errs = append(errs, err)
+	}
+
+	// Check mounted directories
+	mounts, err := filepath.Glob("/mnt/*")
+	if err != nil {
+		errs = append(errs, fmt.Errorf("failed to list mounted directories: %w", err))
+	}
+
+	for _, mount := range mounts {
+		usage, err := disk.Usage(mount)
+		if err != nil {
+			s.log.Error("Failed to get disk usage for %s: %v", mount, err)
+			continue
+		}
+
+		if err := s.reportDisk(
+			mount,
+			usage,
+			fmt.Sprintf("Disk usage for %s", mount),
+			fmt.Sprintf("Disk Usage %s - %s", mount, s.hostname),
+			fmt.Sprintf("disk-%s-%s", filepath.Base(mount), s.hostname),
+		); err != nil {
+			errs = append(errs, err)
+		}
+	}
+
+	// errors.Join returns nil when nothing failed.
+	return errors.Join(errs...)
+}
+
+// reportDisk folds one usage sample for path into that path's EMA, logs the
+// outcome and sends the resulting metric. label prefixes the log line; title
+// and alertID are copied into the metric.
+func (s *SystemMonitor) reportDisk(path string, usage *disk.UsageStat, label, title, alertID string) error {
+	instantValue := usage.UsedPercent
+
+	// Seed the EMA with the first sample for this path so that a freshly
+	// started monitor does not ramp up from zero.
+	previous, seen := s.diskEMAs[path]
+	if !seen {
+		previous = instantValue
+	}
+	ema := s.calculateEMA(instantValue, previous)
+	s.diskEMAs[path] = ema
+
+	status := s.getStatus(ema, s.diskLimit)
+	if status == "fail" {
+		s.log.Warn("%s EMA %.2f%% exceeds limit of %.2f%% (instant: %.2f%%)", label, ema, s.diskLimit, instantValue)
+	} else {
+		s.log.Log("%s EMA: %.2f%% (limit: %.2f%%, instant: %.2f%%), Free: %d MB, Total: %d MB",
+			label,
+			ema,
+			s.diskLimit,
+			instantValue,
+			usage.Free/(1024*1024),
+			usage.Total/(1024*1024))
+	}
+
+	return s.sendMetric(Metric{
+		Title:     title,
+		Cause:     "Disk monitoring check",
+		AlertID:   alertID,
+		Timestamp: time.Now().Unix(),
+		Status:    status,
+		Value:     ema,
+		Limit:     s.diskLimit,
+	})
+}
\ No newline at end of file
diff --git a/pkg/monitor/memory.go b/pkg/monitor/memory.go
new file mode 100644
index 0000000..fb6a18d
--- /dev/null
+++ b/pkg/monitor/memory.go
@@ -0,0 +1,56 @@
+package monitor
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/shirou/gopsutil/v3/mem"
+)
+
+// CheckMemory samples virtual memory usage, folds the sample into the memory
+// EMA and sends the resulting metric.
+//
+// It returns an error when the memory statistics cannot be read or the metric
+// cannot be sent.
+func (s *SystemMonitor) CheckMemory() error {
+	vmStat, err := mem.VirtualMemory()
+	if err != nil {
+		return fmt.Errorf("failed to get memory stats: %w", err)
+	}
+
+	instantValue := vmStat.UsedPercent
+
+	// Seed the EMA with the first sample, as CheckDisk does. Starting from the
+	// zero value under-reports usage until the average catches up, which
+	// delays alerts whenever alpha is below 1. In practice a running host does
+	// not report exactly 0% used memory, so a zero EMA means no sample has
+	// been folded in yet.
+	if s.memoryEMA == 0 {
+		s.memoryEMA = instantValue
+	}
+	s.memoryEMA = s.calculateEMA(instantValue, s.memoryEMA)
+
+	status := s.getStatus(s.memoryEMA, s.memoryLimit)
+	if status == "fail" {
+		s.log.Warn("Memory usage EMA %.2f%% exceeds limit of %.2f%% (instant: %.2f%%)", s.memoryEMA, s.memoryLimit, instantValue)
+	} else {
+		s.log.Log("Memory usage EMA: %.2f%% (limit: %.2f%%, instant: %.2f%%), Available: %d MB, Total: %d MB",
+			s.memoryEMA,
+			s.memoryLimit,
+			instantValue,
+			vmStat.Available/(1024*1024),
+			vmStat.Total/(1024*1024))
+	}
+
+	metric := Metric{
+		Title:     fmt.Sprintf("Memory Usage - %s", s.hostname),
+		Cause:     "Memory monitoring check",
+		AlertID:   fmt.Sprintf("memory-%s", s.hostname),
+		Timestamp: time.Now().Unix(),
+		Status:    status,
+		Value:     s.memoryEMA,
+		Limit:     s.memoryLimit,
+	}
+
+	return s.sendMetric(metric)
+}
\ No newline at end of file
diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go
new file mode 100644
index 0000000..e239bd4
--- /dev/null
+++ b/pkg/monitor/monitor.go
@@ -0,0 +1,175 @@
+package monitor
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/appwrite/monitoring/pkg"
+)
+
+// Metric represents a monitoring metric to be sent to BetterStack
+type Metric struct {
+	Title     string  `json:"title"`
+	Cause     string  `json:"cause"`
+	AlertID   string  `json:"alert_id"`
+	Timestamp int64   `json:"timestamp"`
+	Status    string  `json:"status"`
+	Value     float64 `json:"value"`
+	Limit     float64 `json:"limit"`
+}
+
+// SystemMonitor handles system resource monitoring and alerts
+type SystemMonitor struct {
+	httpClient     *http.Client
+	betterStackURL string
+	hostname       string
+	cpuLimit       float64
+	memoryLimit    float64
+	diskLimit      float64
+	interval       int
+	log            *pkg.Logger
+
+	// EMA tracking
+	cpuEMA    float64
+	memoryEMA float64
+	diskEMAs  map[string]float64 // Map to track EMAs for all disks (root and mounted)
+	alpha     float64            // EMA smoothing factor
+}
+
+// NewSystemMonitor creates a new system monitor instance.
+//
+// interval is the check period in seconds and must be positive. It returns an
+// error when interval is not positive or the hostname cannot be determined.
+func NewSystemMonitor(betterStackURL string, interval int, cpuLimit, memoryLimit, diskLimit float64) (*SystemMonitor, error) {
+	// A zero interval yields alpha = 0, which freezes every EMA, a negative
+	// one yields a meaningless alpha, and either makes time.NewTicker panic in
+	// Start. Reject both up front.
+	if interval <= 0 {
+		return nil, fmt.Errorf("interval must be a positive number of seconds, got %d", interval)
+	}
+
+	hostname, err := os.Hostname()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get hostname: %w", err)
+	}
+
+	// Calculate alpha based on interval to get roughly 5 minutes of smoothing
+	// EMA formula: alpha = 2/(N+1) where N is the number of periods
+	// For 5 minutes of smoothing with our interval: N = 300/interval
+	N := float64(300) / float64(interval)
+	alpha := 2.0 / (N + 1.0)
+
+	// Intervals longer than 300s give N < 1 and therefore alpha > 1, which
+	// weights the previous EMA negatively and overshoots the samples. Clamp
+	// to 1 so the EMA simply tracks the instantaneous value.
+	if alpha > 1 {
+		alpha = 1
+	}
+
+	return &SystemMonitor{
+		httpClient: &http.Client{
+			Timeout: 5 * time.Second,
+		},
+		betterStackURL: betterStackURL,
+		hostname:       hostname,
+		cpuLimit:       cpuLimit,
+		memoryLimit:    memoryLimit,
+		diskLimit:      diskLimit,
+		interval:       interval,
+		log:            pkg.NewLogger(),
+		diskEMAs:       make(map[string]float64), // Initialize the map for all disk EMAs
+		alpha:          alpha,
+	}, nil
+}
+
+// calculateEMA calculates the exponential moving average
+func (s *SystemMonitor) calculateEMA(currentValue, previousEMA float64) float64 {
+	return s.alpha*currentValue + (1-s.alpha)*previousEMA
+}
+
+// getStatus determines if a metric is passing or failing
+func (s *SystemMonitor) getStatus(value, limit float64) string {
+	if value > limit {
+		return "fail"
+	}
+	return "pass"
+}
+
+// sendMetric sends a metric to the BetterStack monitoring endpoint.
+//
+// It returns an error when the metric cannot be encoded, the request cannot be
+// built or sent, or the endpoint answers with a status of 400 or above.
+func (s *SystemMonitor) sendMetric(metric Metric) error {
+	// maxDrainBytes bounds how much of the response body is discarded before
+	// it is closed.
+	const maxDrainBytes = 64 << 10
+
+	body, err := json.Marshal(metric)
+	if err != nil {
+		return fmt.Errorf("failed to marshal metric: %w", err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, s.betterStackURL, strings.NewReader(string(body)))
+	if err != nil {
+		return fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", "application/json; charset=utf-8")
+	req.Header.Set("Accept", "application/json")
+	req.Header.Set("User-Agent", "Appwrite Resource Monitoring")
+
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("failed to send request: %w", err)
+	}
+	defer func() {
+		// Drain the rest of the body before closing so the transport can
+		// reuse the connection for the next metric. Failures here are
+		// ignored on purpose: the outcome was already decided by the status.
+		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrainBytes))
+		_ = resp.Body.Close()
+	}()
+
+	s.log.Log("Response Status: %s", resp.Status)
+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("request failed with status: %d", resp.StatusCode)
+	}
+
+	return nil
+}
+
+// Start begins the monitoring process. It runs one round of checks
+// immediately, then one every interval seconds, and never returns.
+func (s *SystemMonitor) Start() {
+	ticker := time.NewTicker(time.Duration(s.interval) * time.Second)
+	defer ticker.Stop()
+
+	// Initial check
+	s.runChecks()
+
+	// Periodic checks
+	for range ticker.C {
+		s.runChecks()
+	}
+}
+
+// runChecks executes all monitoring checks. A failing check is logged and does
+// not prevent the remaining checks from running.
+func (s *SystemMonitor) runChecks() {
+	if err := s.CheckCPU(); err != nil {
+		s.log.Error("Error checking CPU: %v", err)
+	}
+
+	if err := s.CheckMemory(); err != nil {
+		s.log.Error("Error checking memory: %v", err)
+	}
+
+	if err := s.CheckDisk(); err != nil {
+		s.log.Error("Error checking disk: %v", err)
+	}
+}
\ No newline at end of file