diff --git a/BCC_TRACING.md b/BCC_TRACING.md new file mode 100644 index 0000000..466acad --- /dev/null +++ b/BCC_TRACING.md @@ -0,0 +1,298 @@ +# BCC-Style eBPF Tracing Implementation + +## Overview + +This implementation adds powerful BCC-style (BPF Compiler Collection) tracing capabilities to the diagnostic agent, similar to the `trace.py` tool from the iovisor BCC project. Instead of just filtering events, this system actually counts and traces real system calls with detailed argument parsing. + +## Key Features + +### 1. Real System Call Tracing +- **Actual event counting**: Unlike the previous implementation that just simulated events, this captures real system calls +- **Argument extraction**: Extracts function arguments (arg1, arg2, etc.) and return values +- **Multiple probe types**: Supports kprobes, kretprobes, tracepoints, and uprobes +- **Filtering capabilities**: Filter by process name, PID, UID, argument values + +### 2. BCC-Style Syntax +Supports familiar BCC trace.py syntax patterns: +```bash +# Simple syscall tracing +"sys_open" # Trace open syscalls +"sys_read (arg3 > 1024)" # Trace reads >1024 bytes +"r::sys_open" # Return probe on open + +# With format strings +"sys_write \"wrote %d bytes\", arg3" +"sys_open \"opening %s\", arg2@user" +``` + +### 3. Comprehensive Event Data +Each trace captures: +```json +{ + "timestamp": 1234567890, + "pid": 1234, + "tid": 1234, + "process_name": "nginx", + "function": "__x64_sys_openat", + "message": "opening file: /var/log/access.log", + "raw_args": { + "arg1": "3", + "arg2": "/var/log/access.log", + "arg3": "577" + } +} +``` + +## Architecture + +### Core Components + +1. **BCCTraceManager** (`ebpf_trace_manager.go`) + - Main orchestrator for BCC-style tracing + - Generates bpftrace scripts dynamically + - Manages trace sessions and event collection + +2. 
**TraceSpec** - Trace specification format + ```go + type TraceSpec struct { + ProbeType string // "p", "r", "t", "u" + Target string // Function/syscall to trace + Format string // Output format string + Arguments []string // Arguments to extract + Filter string // Filter conditions + Duration int // Trace duration in seconds + ProcessName string // Process filter + PID int // Process ID filter + UID int // User ID filter + } + ``` + +3. **EventScanner** (`ebpf_event_parser.go`) + - Parses bpftrace output in real-time + - Converts raw trace data to structured events + - Handles argument extraction and enrichment + +4. **TraceSpecBuilder** - Fluent API for building specs + ```go + spec := NewTraceSpecBuilder(). + Kprobe("__x64_sys_write"). + Format("write %d bytes to fd %d", "arg3", "arg1"). + Filter("arg1 == 1"). + Duration(30). + Build() + ``` + +## Usage Examples + +### 1. Basic System Call Tracing + +```go +// Trace file open operations +spec := TraceSpec{ + ProbeType: "p", + Target: "__x64_sys_openat", + Format: "opening file: %s", + Arguments: []string{"arg2@user"}, + Duration: 30, +} + +traceID, err := manager.StartTrace(spec) +``` + +### 2. Filtered Tracing + +```go +// Trace only large reads +spec := TraceSpec{ + ProbeType: "p", + Target: "__x64_sys_read", + Format: "read %d bytes from fd %d", + Arguments: []string{"arg3", "arg1"}, + Filter: "arg3 > 1024", + Duration: 30, +} +``` + +### 3. Process-Specific Tracing + +```go +// Trace only nginx processes +spec := TraceSpec{ + ProbeType: "p", + Target: "__x64_sys_write", + ProcessName: "nginx", + Duration: 60, +} +``` + +### 4. 
Return Value Tracing + +```go +// Trace return values from file operations +spec := TraceSpec{ + ProbeType: "r", + Target: "__x64_sys_openat", + Format: "open returned: %d", + Arguments: []string{"retval"}, + Duration: 30, +} +``` + +## Integration with Agent + +### API Request Format +The remote API can send trace specifications in the `ebpf_programs` field: + +```json +{ + "commands": [ + {"id": "cmd1", "command": "ps aux"} + ], + "ebpf_programs": [ + { + "name": "file_monitoring", + "type": "kprobe", + "target": "sys_open", + "duration": 30, + "filters": {"process": "nginx"}, + "description": "Monitor file access by nginx" + } + ] +} +``` + +### Agent Response Format +The agent returns detailed trace results: + +```json +{ + "name": "__x64_sys_openat", + "type": "bcc_trace", + "target": "__x64_sys_openat", + "duration": 30, + "status": "completed", + "success": true, + "event_count": 45, + "events": [ + { + "timestamp": 1234567890, + "pid": 1234, + "process_name": "nginx", + "function": "__x64_sys_openat", + "message": "opening file: /var/log/access.log", + "raw_args": {"arg1": "3", "arg2": "/var/log/access.log"} + } + ], + "statistics": { + "total_events": 45, + "events_per_second": 1.5, + "top_processes": [ + {"process_name": "nginx", "event_count": 30}, + {"process_name": "apache", "event_count": 15} + ] + } +} +``` + +## Test Specifications + +The implementation includes test specifications for unit testing: + +- **test_sys_open**: File open operations +- **test_sys_read**: Read operations with filters +- **test_sys_write**: Write operations +- **test_process_creation**: Process execution +- **test_kretprobe**: Return value tracing +- **test_with_filter**: Filtered tracing + +## Running Tests + +```bash +# Run all BCC tracing tests +go test -v -run TestBCCTracing + +# Test trace manager capabilities +go test -v -run TestTraceManagerCapabilities + +# Test syscall suggestions +go test -v -run TestSyscallSuggestions + +# Run all tests +go test -v +``` + +## 
Requirements + +### System Requirements +- **Linux kernel 4.4+** with eBPF support +- **bpftrace** installed (`apt install bpftrace`) +- **Root privileges** for actual tracing + +### Checking Capabilities +The trace manager automatically detects capabilities: + +```bash +$ go test -run TestTraceManagerCapabilities +🔧 Trace Manager Capabilities: + ✅ kernel_ebpf: Available + ✅ bpftrace: Available + ❌ root_access: Not Available + ❌ debugfs_access: Not Available +``` + +## Advanced Features + +### 1. Syscall Suggestions +The system can suggest appropriate syscalls based on issue descriptions: + +```go +suggestions := SuggestSyscallTargets("file not found error") +// Returns: ["test_sys_open", "test_sys_read", "test_sys_write", "test_sys_unlink"] +``` + +### 2. BCC-Style Parsing +Parse BCC trace.py style specifications: + +```go +parser := NewTraceSpecParser() +spec, err := parser.ParseFromBCCStyle("sys_write (arg1 == 1) \"stdout: %d bytes\", arg3") +``` + +### 3. Event Filtering and Aggregation +Post-processing capabilities for trace events: + +```go +filter := &TraceEventFilter{ + ProcessNames: []string{"nginx", "apache"}, + MinTimestamp: startTime, +} +filteredEvents := filter.ApplyFilter(events) + +aggregator := NewTraceEventAggregator(events) +topProcesses := aggregator.GetTopProcesses(5) +eventRate := aggregator.GetEventRate() +``` + +## Performance Considerations + +- **Short durations**: Test specs use 5-second durations for quick testing +- **Efficient parsing**: Event scanner processes bpftrace output in real-time +- **Memory management**: Events are processed and aggregated efficiently +- **Timeout handling**: Automatic cleanup of hanging trace sessions + +## Security Considerations + +- **Root privileges required**: eBPF tracing requires root access +- **Resource limits**: Maximum trace duration of 10 minutes +- **Process isolation**: Each trace runs in its own context +- **Automatic cleanup**: Traces are automatically stopped and cleaned up + +## 
Future Enhancements + +1. **USDT probe support**: Add support for user-space tracing +2. **BTF integration**: Use BPF Type Format for better type information +3. **Flame graph generation**: Generate performance flame graphs +4. **Custom eBPF programs**: Allow uploading custom eBPF bytecode +5. **Distributed tracing**: Correlation across multiple hosts + +This implementation provides a solid foundation for advanced system introspection and debugging, bringing the power of BCC-style tracing to the diagnostic agent. \ No newline at end of file diff --git a/agent.go b/agent.go index 990464a..fe63d3b 100644 --- a/agent.go +++ b/agent.go @@ -11,6 +11,9 @@ import ( "sync" "time" + "nannyagentv2/internal/logging" + "nannyagentv2/internal/types" + "github.com/sashabaranov/go-openai" ) @@ -25,28 +28,6 @@ type DiagnosticResponse struct { ConfidenceLevel float64 `json:"confidence_level"` } -// EBPFRequest represents a request for eBPF program execution -type EBPFRequest struct { - Name string `json:"name"` - Type string `json:"type"` - Target string `json:"target"` - Duration int `json:"duration"` - Filters map[string]string `json:"filters,omitempty"` - Description string `json:"description"` -} - -// EBPFEnhancedDiagnosticResponse represents the enhanced diagnostic response with eBPF -type EBPFEnhancedDiagnosticResponse struct { - ResponseType string `json:"response_type"` - Phase string `json:"phase"` - Analysis string `json:"analysis"` - Commands []string `json:"commands"` - EBPFPrograms []EBPFRequest `json:"ebpf_programs"` - NextSteps []string `json:"next_steps"` - Reasoning string `json:"reasoning"` - ConfidenceLevel float64 `json:"confidence_level"` -} - // ResolutionResponse represents the resolution phase response from AI type ResolutionResponse struct { ResponseType string `json:"response_type"` @@ -93,6 +74,7 @@ type LinuxDiagnosticAgent struct { episodeID string // TensorZero episode ID for conversation continuity ebpfManager *BCCTraceManager // BCC-style eBPF 
tracing capabilities config *AgentConfig // Configuration for concurrent execution + authManager interface{} // Authentication manager for TensorZero requests } // NewLinuxDiagnosticAgent creates a new diagnostic agent @@ -100,14 +82,14 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent { // Get Supabase project URL for TensorZero proxy supabaseURL := os.Getenv("SUPABASE_PROJECT_URL") if supabaseURL == "" { - fmt.Printf("Warning: SUPABASE_PROJECT_URL not set, TensorZero integration will not work\n") + logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work") supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback } model := os.Getenv("NANNYAPI_MODEL") if model == "" { model = "tensorzero::function_name::diagnose_and_heal" - fmt.Printf("Warning: Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function.\n", model) + logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model) } // Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy @@ -124,10 +106,40 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent { return agent } +// NewLinuxDiagnosticAgentWithAuth creates a new diagnostic agent with authentication +func NewLinuxDiagnosticAgentWithAuth(authManager interface{}) *LinuxDiagnosticAgent { + // Get Supabase project URL for TensorZero proxy + supabaseURL := os.Getenv("SUPABASE_PROJECT_URL") + if supabaseURL == "" { + logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work") + supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback + } + + model := os.Getenv("NANNYAPI_MODEL") + if model == "" { + model = "tensorzero::function_name::diagnose_and_heal" + logging.Warning("Using default model '%s'. 
Set NANNYAPI_MODEL environment variable for your specific function", model) + } + + // Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy + agent := &LinuxDiagnosticAgent{ + client: nil, // Not used anymore + model: model, + executor: NewCommandExecutor(10 * time.Second), // 10 second timeout for commands + config: DefaultAgentConfig(), // Default concurrent execution config + authManager: authManager, // Store auth manager for TensorZero requests + } + + // Initialize BCC-style eBPF capabilities + agent.ebpfManager = NewBCCTraceManager() + + return agent +} + // DiagnoseIssue starts the diagnostic process for a given issue func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { - fmt.Printf("Diagnosing issue: %s\n", issue) - fmt.Println("Gathering system information...") + logging.Info("Diagnosing issue: %s", issue) + logging.Info("Gathering system information...") // Gather system information systemInfo := GatherSystemInfo() @@ -155,25 +167,27 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { } content := response.Choices[0].Message.Content - fmt.Printf("\nAI Response:\n%s\n", content) + logging.Debug("AI Response: %s", content) // Parse the response to determine next action - var diagnosticResp EBPFEnhancedDiagnosticResponse + var diagnosticResp types.EBPFEnhancedDiagnosticResponse var resolutionResp ResolutionResponse // Try to parse as diagnostic response first (with eBPF support) + logging.Debug("Attempting to parse response as diagnostic...") if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" { + logging.Debug("Successfully parsed as diagnostic response with %d commands", len(diagnosticResp.Commands)) // Handle diagnostic phase - fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning) + logging.Debug("Reasoning: %s", diagnosticResp.Reasoning) // Execute commands and collect results commandResults := make([]CommandResult, 0, 
len(diagnosticResp.Commands)) if len(diagnosticResp.Commands) > 0 { - fmt.Printf("๐Ÿ”ง Executing diagnostic commands...\n") + logging.Info("Executing %d diagnostic commands", len(diagnosticResp.Commands)) for i, cmdStr := range diagnosticResp.Commands { - // Convert string to Command struct + // Convert string command to Command struct (auto-generate ID and description) cmd := Command{ - ID: fmt.Sprintf("cmd_%d", i), + ID: fmt.Sprintf("cmd_%d", i+1), Command: cmdStr, Description: fmt.Sprintf("Diagnostic command: %s", cmdStr), } @@ -181,7 +195,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { commandResults = append(commandResults, result) if result.ExitCode != 0 { - fmt.Printf("โŒ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode) + logging.Warning("Command '%s' failed with exit code %d", cmd.ID, result.ExitCode) } } } @@ -189,7 +203,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { // Execute eBPF programs if present - support both old and new formats var ebpfResults []map[string]interface{} if len(diagnosticResp.EBPFPrograms) > 0 { - fmt.Printf("๐Ÿ”ฌ AI requested %d eBPF traces for enhanced diagnostics\n", len(diagnosticResp.EBPFPrograms)) + logging.Info("AI requested %d eBPF traces for enhanced diagnostics", len(diagnosticResp.EBPFPrograms)) // Convert EBPFPrograms to TraceSpecs and execute concurrently traceSpecs := a.convertEBPFProgramsToTraceSpecs(diagnosticResp.EBPFPrograms) @@ -210,12 +224,17 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { // Extract evidence summary for TensorZero evidenceSummary := make([]string, 0) for _, result := range ebpfResults { - name := result["name"] - eventCount := result["data_points"] - description := result["description"] - status := result["status"] + target := result["target"] + eventCount := result["event_count"] + summary := result["summary"] + success := result["success"] - summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, 
eventCount, status, description) + status := "failed" + if success == true { + status = "success" + } + + summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", target, eventCount, status, summary) evidenceSummary = append(evidenceSummary, summaryStr) } allResults["ebpf_evidence_summary"] = evidenceSummary @@ -237,20 +256,22 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error { }) continue + } else { + logging.Debug("Failed to parse as diagnostic. Error: %v, ResponseType: '%s'", err, diagnosticResp.ResponseType) } // Try to parse as resolution response if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" { // Handle resolution phase - fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n") - fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause) - fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan) - fmt.Printf("Confidence: %s\n", resolutionResp.Confidence) + logging.Info("=== DIAGNOSIS COMPLETE ===") + logging.Info("Root Cause: %s", resolutionResp.RootCause) + logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan) + logging.Info("Confidence: %s", resolutionResp.Confidence) break } // If we can't parse the response, treat it as an error or unexpected format - fmt.Printf("Unexpected response format or error from AI:\n%s\n", content) + logging.Error("Unexpected response format or error from AI: %s", content) break } @@ -296,8 +317,9 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp return nil, fmt.Errorf("SUPABASE_PROJECT_URL not set") } - // Create HTTP request to TensorZero proxy - endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy", supabaseURL) + // Create HTTP request to TensorZero proxy (includes OpenAI-compatible path) + endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy/openai/v1/chat/completions", supabaseURL) + logging.Debug("Calling TensorZero proxy at: %s", endpoint) req, err := http.NewRequest("POST", endpoint, 
bytes.NewBuffer(requestBody)) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) @@ -307,7 +329,17 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp req.Header.Set("Content-Type", "application/json") req.Header.Set("Accept", "application/json") - // Note: No authentication needed for TensorZero proxy based on the existing pattern + // Add authentication if auth manager is available (same pattern as investigation_server.go) + if a.authManager != nil { + // The authManager should be *auth.AuthManager, so let's use the exact same pattern + if authMgr, ok := a.authManager.(interface { + LoadToken() (*types.AuthToken, error) + }); ok { + if authToken, err := authMgr.LoadToken(); err == nil && authToken != nil { + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken)) + } + } + } // Send request client := &http.Client{Timeout: 30 * time.Second} @@ -372,7 +404,7 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp } // convertEBPFProgramsToTraceSpecs converts old EBPFProgram format to new TraceSpec format -func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EBPFRequest) []TraceSpec { +func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []types.EBPFRequest) []TraceSpec { var traceSpecs []TraceSpec for _, prog := range ebpfPrograms { @@ -384,7 +416,7 @@ func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EB } // convertToTraceSpec converts an EBPFRequest to a TraceSpec for BCC-style tracing -func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec { +func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog types.EBPFRequest) TraceSpec { // Determine probe type based on target and type probeType := "p" // default to kprobe target := prog.Target @@ -421,6 +453,7 @@ func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec { Format: prog.Description, 
// Use description as format Arguments: []string{}, // Start with no arguments for compatibility Duration: duration, + UID: -1, // No UID filter (don't default to 0 which means root only) } } @@ -430,7 +463,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp return []map[string]interface{}{} } - fmt.Printf("๐Ÿš€ Executing %d BCC traces with max %d concurrent tasks\n", len(traceSpecs), a.config.MaxConcurrentTasks) + logging.Info("Executing %d BCC traces with max %d concurrent tasks", len(traceSpecs), a.config.MaxConcurrentTasks) // Channel to limit concurrent goroutines semaphore := make(chan struct{}, a.config.MaxConcurrentTasks) @@ -465,7 +498,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp } if a.config.CollectiveResults { - fmt.Printf("โœ… All %d BCC traces completed. Sending collective results to API layer.\n", len(allResults)) + logging.Debug("All %d BCC traces completed. Sending collective results to API layer", len(allResults)) } return allResults @@ -482,18 +515,18 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec) "start_time": time.Now().Format(time.RFC3339), } - fmt.Printf("๐Ÿ” [Task %d] Starting BCC trace: %s (type: %s)\n", index, spec.Target, spec.ProbeType) + logging.Debug("[Task %d] Starting BCC trace: %s (type: %s)", index, spec.Target, spec.ProbeType) // Start the trace traceID, err := a.ebpfManager.StartTrace(spec) if err != nil { result["error"] = fmt.Sprintf("Failed to start trace: %v", err) - fmt.Printf("โŒ [Task %d] Failed to start trace %s: %v\n", index, spec.Target, err) + logging.Error("[Task %d] Failed to start trace %s: %v", index, spec.Target, err) return result } result["trace_id"] = traceID - fmt.Printf("๐Ÿš€ [Task %d] Trace %s started with ID: %s\n", index, spec.Target, traceID) + logging.Debug("[Task %d] Trace %s started with ID: %s", index, spec.Target, traceID) // Wait for the trace duration 
time.Sleep(time.Duration(spec.Duration) * time.Second) @@ -504,7 +537,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec) // Try to stop the trace if it's still running a.ebpfManager.StopTrace(traceID) result["error"] = fmt.Sprintf("Failed to get trace results: %v", err) - fmt.Printf("โŒ [Task %d] Failed to get results for trace %s: %v\n", index, spec.Target, err) + logging.Error("[Task %d] Failed to get results for trace %s: %v", index, spec.Target, err) return result } @@ -551,7 +584,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec) result["top_processes"] = topProcesses } - fmt.Printf("โœ… [Task %d] Trace %s completed: %d events (%.2f events/sec)\n", + logging.Debug("[Task %d] Trace %s completed: %d events (%.2f events/sec)", index, spec.Target, traceResult.EventCount, traceResult.Statistics.EventsPerSecond) return result diff --git a/ebpf_event_parser.go b/ebpf_event_parser.go new file mode 100644 index 0000000..bb944ea --- /dev/null +++ b/ebpf_event_parser.go @@ -0,0 +1,343 @@ +package main + +import ( + "bufio" + "io" + "regexp" + "strconv" + "strings" + "time" +) + +// EventScanner parses bpftrace output and converts it to TraceEvent structs +type EventScanner struct { + scanner *bufio.Scanner + lastEvent *TraceEvent + lineRegex *regexp.Regexp +} + +// NewEventScanner creates a new event scanner for parsing bpftrace output +func NewEventScanner(reader io.Reader) *EventScanner { + // Regex pattern to match our trace output format: + // TRACE|timestamp|pid|tid|comm|function|message + pattern := `^TRACE\|(\d+)\|(\d+)\|(\d+)\|([^|]+)\|([^|]+)\|(.*)$` + regex, _ := regexp.Compile(pattern) + + return &EventScanner{ + scanner: bufio.NewScanner(reader), + lineRegex: regex, + } +} + +// Scan advances the scanner to the next event +func (es *EventScanner) Scan() bool { + for es.scanner.Scan() { + line := strings.TrimSpace(es.scanner.Text()) + + // Skip empty lines and non-trace lines + if line == "" 
|| !strings.HasPrefix(line, "TRACE|") { + continue + } + + // Parse the trace line + if event := es.parseLine(line); event != nil { + es.lastEvent = event + return true + } + } + + return false +} + +// Event returns the most recently parsed event +func (es *EventScanner) Event() *TraceEvent { + return es.lastEvent +} + +// Error returns any scanning error +func (es *EventScanner) Error() error { + return es.scanner.Err() +} + +// parseLine parses a single trace line into a TraceEvent +func (es *EventScanner) parseLine(line string) *TraceEvent { + matches := es.lineRegex.FindStringSubmatch(line) + if len(matches) != 7 { + return nil + } + + // Parse timestamp (nanoseconds) + timestamp, err := strconv.ParseInt(matches[1], 10, 64) + if err != nil { + return nil + } + + // Parse PID + pid, err := strconv.Atoi(matches[2]) + if err != nil { + return nil + } + + // Parse TID + tid, err := strconv.Atoi(matches[3]) + if err != nil { + return nil + } + + // Extract process name, function, and message + processName := strings.TrimSpace(matches[4]) + function := strings.TrimSpace(matches[5]) + message := strings.TrimSpace(matches[6]) + + event := &TraceEvent{ + Timestamp: timestamp, + PID: pid, + TID: tid, + ProcessName: processName, + Function: function, + Message: message, + RawArgs: make(map[string]string), + } + + // Try to extract additional information from the message + es.enrichEvent(event, message) + + return event +} + +// enrichEvent extracts additional information from the message +func (es *EventScanner) enrichEvent(event *TraceEvent, message string) { + // Parse common patterns in messages to extract arguments + // This is a simplified version - in a real implementation you'd want more sophisticated parsing + + // Look for patterns like "arg1=value, arg2=value" + argPattern := regexp.MustCompile(`(\w+)=([^,\s]+)`) + matches := argPattern.FindAllStringSubmatch(message, -1) + + for _, match := range matches { + if len(match) == 3 { + event.RawArgs[match[1]] = 
match[2] + } + } + + // Look for numeric patterns that might be syscall arguments + numberPattern := regexp.MustCompile(`\b(\d+)\b`) + numbers := numberPattern.FindAllString(message, -1) + + for i, num := range numbers { + argName := "arg" + strconv.Itoa(i+1) + event.RawArgs[argName] = num + } +} + +// TraceEventFilter provides filtering capabilities for trace events +type TraceEventFilter struct { + MinTimestamp int64 + MaxTimestamp int64 + ProcessNames []string + PIDs []int + UIDs []int + Functions []string + MessageFilter string +} + +// ApplyFilter applies filters to a slice of events +func (filter *TraceEventFilter) ApplyFilter(events []TraceEvent) []TraceEvent { + if filter == nil { + return events + } + + var filtered []TraceEvent + + for _, event := range events { + if filter.matchesEvent(&event) { + filtered = append(filtered, event) + } + } + + return filtered +} + +// matchesEvent checks if an event matches the filter criteria +func (filter *TraceEventFilter) matchesEvent(event *TraceEvent) bool { + // Check timestamp range + if filter.MinTimestamp > 0 && event.Timestamp < filter.MinTimestamp { + return false + } + if filter.MaxTimestamp > 0 && event.Timestamp > filter.MaxTimestamp { + return false + } + + // Check process names + if len(filter.ProcessNames) > 0 { + found := false + for _, name := range filter.ProcessNames { + if strings.Contains(event.ProcessName, name) { + found = true + break + } + } + if !found { + return false + } + } + + // Check PIDs + if len(filter.PIDs) > 0 { + found := false + for _, pid := range filter.PIDs { + if event.PID == pid { + found = true + break + } + } + if !found { + return false + } + } + + // Check UIDs + if len(filter.UIDs) > 0 { + found := false + for _, uid := range filter.UIDs { + if event.UID == uid { + found = true + break + } + } + if !found { + return false + } + } + + // Check functions + if len(filter.Functions) > 0 { + found := false + for _, function := range filter.Functions { + if 
strings.Contains(event.Function, function) { + found = true + break + } + } + if !found { + return false + } + } + + // Check message filter + if filter.MessageFilter != "" { + if !strings.Contains(event.Message, filter.MessageFilter) { + return false + } + } + + return true +} + +// TraceEventAggregator provides aggregation capabilities for trace events +type TraceEventAggregator struct { + events []TraceEvent +} + +// NewTraceEventAggregator creates a new event aggregator +func NewTraceEventAggregator(events []TraceEvent) *TraceEventAggregator { + return &TraceEventAggregator{ + events: events, + } +} + +// CountByProcess returns event counts grouped by process +func (agg *TraceEventAggregator) CountByProcess() map[string]int { + counts := make(map[string]int) + for _, event := range agg.events { + counts[event.ProcessName]++ + } + return counts +} + +// CountByFunction returns event counts grouped by function +func (agg *TraceEventAggregator) CountByFunction() map[string]int { + counts := make(map[string]int) + for _, event := range agg.events { + counts[event.Function]++ + } + return counts +} + +// CountByPID returns event counts grouped by PID +func (agg *TraceEventAggregator) CountByPID() map[int]int { + counts := make(map[int]int) + for _, event := range agg.events { + counts[event.PID]++ + } + return counts +} + +// GetTimeRange returns the time range of events +func (agg *TraceEventAggregator) GetTimeRange() (int64, int64) { + if len(agg.events) == 0 { + return 0, 0 + } + + minTime := agg.events[0].Timestamp + maxTime := agg.events[0].Timestamp + + for _, event := range agg.events { + if event.Timestamp < minTime { + minTime = event.Timestamp + } + if event.Timestamp > maxTime { + maxTime = event.Timestamp + } + } + + return minTime, maxTime +} + +// GetEventRate calculates events per second +func (agg *TraceEventAggregator) GetEventRate() float64 { + if len(agg.events) < 2 { + return 0 + } + + minTime, maxTime := agg.GetTimeRange() + durationNs := 
maxTime - minTime + durationSeconds := float64(durationNs) / float64(time.Second) + + if durationSeconds == 0 { + return 0 + } + + return float64(len(agg.events)) / durationSeconds +} + +// GetTopProcesses returns the most active processes +func (agg *TraceEventAggregator) GetTopProcesses(limit int) []ProcessStat { + counts := agg.CountByProcess() + total := len(agg.events) + + var stats []ProcessStat + for processName, count := range counts { + percentage := float64(count) / float64(total) * 100 + stats = append(stats, ProcessStat{ + ProcessName: processName, + EventCount: count, + Percentage: percentage, + }) + } + + // Simple sorting by event count (bubble sort for simplicity) + for i := 0; i < len(stats); i++ { + for j := i + 1; j < len(stats); j++ { + if stats[j].EventCount > stats[i].EventCount { + stats[i], stats[j] = stats[j], stats[i] + } + } + } + + if limit > 0 && limit < len(stats) { + stats = stats[:limit] + } + + return stats +} diff --git a/ebpf_trace_manager.go b/ebpf_trace_manager.go new file mode 100644 index 0000000..9dd35b3 --- /dev/null +++ b/ebpf_trace_manager.go @@ -0,0 +1,587 @@ +package main + +import ( + "context" + "fmt" + "io" + "os" + "os/exec" + "strings" + "sync" + "time" + + "nannyagentv2/internal/logging" +) + +// TraceSpec represents a trace specification similar to BCC trace.py +type TraceSpec struct { + // Probe type: "p" (kprobe), "r" (kretprobe), "t" (tracepoint), "u" (uprobe) + ProbeType string `json:"probe_type"` + + // Target function/syscall/tracepoint + Target string `json:"target"` + + // Library for userspace probes (empty for kernel) + Library string `json:"library,omitempty"` + + // Format string for output (e.g., "read %d bytes", arg3) + Format string `json:"format"` + + // Arguments to extract (e.g., ["arg1", "arg2", "retval"]) + Arguments []string `json:"arguments"` + + // Filter condition (e.g., "arg3 > 20000") + Filter string `json:"filter,omitempty"` + + // Duration in seconds + Duration int `json:"duration"` + + 
// Process ID filter (optional) + PID int `json:"pid,omitempty"` + + // Thread ID filter (optional) + TID int `json:"tid,omitempty"` + + // UID filter (optional) + UID int `json:"uid,omitempty"` + + // Process name filter (optional) + ProcessName string `json:"process_name,omitempty"` +} + +// TraceEvent represents a captured event from eBPF +type TraceEvent struct { + Timestamp int64 `json:"timestamp"` + PID int `json:"pid"` + TID int `json:"tid"` + UID int `json:"uid"` + ProcessName string `json:"process_name"` + Function string `json:"function"` + Message string `json:"message"` + RawArgs map[string]string `json:"raw_args"` + CPU int `json:"cpu,omitempty"` +} + +// TraceResult represents the results of a tracing session +type TraceResult struct { + TraceID string `json:"trace_id"` + Spec TraceSpec `json:"spec"` + Events []TraceEvent `json:"events"` + EventCount int `json:"event_count"` + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Summary string `json:"summary"` + Statistics TraceStats `json:"statistics"` +} + +// TraceStats provides statistics about the trace +type TraceStats struct { + TotalEvents int `json:"total_events"` + EventsByProcess map[string]int `json:"events_by_process"` + EventsByUID map[int]int `json:"events_by_uid"` + EventsPerSecond float64 `json:"events_per_second"` + TopProcesses []ProcessStat `json:"top_processes"` +} + +// ProcessStat represents statistics for a process +type ProcessStat struct { + ProcessName string `json:"process_name"` + PID int `json:"pid"` + EventCount int `json:"event_count"` + Percentage float64 `json:"percentage"` +} + +// BCCTraceManager implements advanced eBPF tracing similar to BCC trace.py +type BCCTraceManager struct { + traces map[string]*RunningTrace + tracesLock sync.RWMutex + traceCounter int + capabilities map[string]bool +} + +// RunningTrace represents an active trace session +type RunningTrace struct { + ID string + Spec TraceSpec + Process *exec.Cmd + Events 
[]TraceEvent + StartTime time.Time + Cancel context.CancelFunc + Context context.Context + Done chan struct{} // Signal when trace monitoring is complete +} + +// NewBCCTraceManager creates a new BCC-style trace manager +func NewBCCTraceManager() *BCCTraceManager { + manager := &BCCTraceManager{ + traces: make(map[string]*RunningTrace), + capabilities: make(map[string]bool), + } + + manager.testCapabilities() + return manager +} + +// testCapabilities checks what tracing capabilities are available +func (tm *BCCTraceManager) testCapabilities() { + // Test if bpftrace is available + if _, err := exec.LookPath("bpftrace"); err == nil { + tm.capabilities["bpftrace"] = true + } else { + tm.capabilities["bpftrace"] = false + } + + // Test if perf is available for fallback + if _, err := exec.LookPath("perf"); err == nil { + tm.capabilities["perf"] = true + } else { + tm.capabilities["perf"] = false + } + + // Test root privileges (required for eBPF) + tm.capabilities["root_access"] = os.Geteuid() == 0 + + // Test kernel version + cmd := exec.Command("uname", "-r") + output, err := cmd.Output() + if err == nil { + version := strings.TrimSpace(string(output)) + // eBPF requires kernel 4.4+ + tm.capabilities["kernel_ebpf"] = !strings.HasPrefix(version, "3.") + } else { + tm.capabilities["kernel_ebpf"] = false + } + + // Test if we can access debugfs + if _, err := os.Stat("/sys/kernel/debug/tracing/available_events"); err == nil { + tm.capabilities["debugfs_access"] = true + } else { + tm.capabilities["debugfs_access"] = false + } + + logging.Debug("BCC Trace capabilities: %+v", tm.capabilities) +} + +// GetCapabilities returns available tracing capabilities +func (tm *BCCTraceManager) GetCapabilities() map[string]bool { + tm.tracesLock.RLock() + defer tm.tracesLock.RUnlock() + + caps := make(map[string]bool) + for k, v := range tm.capabilities { + caps[k] = v + } + return caps +} + +// StartTrace starts a new trace session based on the specification +func (tm 
*BCCTraceManager) StartTrace(spec TraceSpec) (string, error) {
	// Refuse to start when a hard prerequisite is missing; these mirror
	// the probes done in testCapabilities.
	if !tm.capabilities["bpftrace"] {
		return "", fmt.Errorf("bpftrace not available - install bpftrace package")
	}

	if !tm.capabilities["root_access"] {
		return "", fmt.Errorf("root access required for eBPF tracing")
	}

	if !tm.capabilities["kernel_ebpf"] {
		return "", fmt.Errorf("kernel version does not support eBPF")
	}

	tm.tracesLock.Lock()
	defer tm.tracesLock.Unlock()

	// Generate trace ID
	tm.traceCounter++
	traceID := fmt.Sprintf("trace_%d", tm.traceCounter)

	// Generate bpftrace script
	script, err := tm.generateBpftraceScript(spec)
	if err != nil {
		return "", fmt.Errorf("failed to generate bpftrace script: %w", err)
	}

	// Debug: log the generated script
	logging.Debug("Generated bpftrace script for %s:\n%s", spec.Target, script)

	// Create context with timeout so the trace self-terminates after Duration.
	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(spec.Duration)*time.Second)

	// Start bpftrace process
	cmd := exec.CommandContext(ctx, "bpftrace", "-e", script)

	// Create stdout pipe BEFORE starting
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		cancel()
		return "", fmt.Errorf("failed to create stdout pipe: %w", err)
	}

	trace := &RunningTrace{
		ID:        traceID,
		Spec:      spec,
		Process:   cmd,
		Events:    []TraceEvent{},
		StartTime: time.Now(),
		Cancel:    cancel,
		Context:   ctx,
		Done:      make(chan struct{}), // Initialize completion signal
	}

	// Start the trace
	if err := cmd.Start(); err != nil {
		cancel()
		return "", fmt.Errorf("failed to start bpftrace: %w", err)
	}

	tm.traces[traceID] = trace

	// Monitor the trace in a goroutine
	go tm.monitorTrace(traceID, stdout)

	logging.Debug("Started BCC-style trace %s for target %s", traceID, spec.Target)
	return traceID, nil
}

// generateBpftraceScript generates a bpftrace script based on the trace
// specification: a BEGIN banner, the probe with optional filters and a
// printf of the requested arguments, and an END banner.
func (tm *BCCTraceManager) generateBpftraceScript(spec TraceSpec) (string, error) {
	var script strings.Builder

	// Build probe specification. (The original code had identical if/else
	// branches for the sys_/__x64_sys_ prefixes; the prefix makes no
	// difference to the emitted probe, so the branches are collapsed.)
	var probe string
	switch spec.ProbeType {
	case "p", "": // kprobe (default)
		probe = fmt.Sprintf("kprobe:%s", spec.Target)
	case "r": // kretprobe
		probe = fmt.Sprintf("kretprobe:%s", spec.Target)
	case "t": // tracepoint
		// If target already includes tracepoint prefix, use as-is
		if strings.HasPrefix(spec.Target, "tracepoint:") {
			probe = spec.Target
		} else {
			probe = fmt.Sprintf("tracepoint:%s", spec.Target)
		}
	case "u": // uprobe
		if spec.Library == "" {
			return "", fmt.Errorf("library required for uprobe")
		}
		probe = fmt.Sprintf("uprobe:%s:%s", spec.Library, spec.Target)
	default:
		return "", fmt.Errorf("unsupported probe type: %s", spec.ProbeType)
	}

	// Add BEGIN block
	script.WriteString("BEGIN {\n")
	script.WriteString(fmt.Sprintf("  printf(\"Starting trace for %s...\\n\");\n", spec.Target))
	script.WriteString("}\n\n")

	// Build the main probe
	script.WriteString(fmt.Sprintf("%s {\n", probe))

	// Add filters if specified
	if tm.needsFiltering(spec) {
		script.WriteString("  if (")
		filters := tm.buildFilters(spec)
		script.WriteString(strings.Join(filters, " && "))
		script.WriteString(") {\n")
	}

	// Build output format
	outputFormat := tm.buildOutputFormat(spec)
	script.WriteString(fmt.Sprintf("    printf(\"%s\\n\"", outputFormat))

	// Add arguments
	args := tm.buildArgumentList(spec)
	if len(args) > 0 {
		script.WriteString(", ")
		script.WriteString(strings.Join(args, ", "))
	}

	script.WriteString(");\n")

	// Close filter if block
	if tm.needsFiltering(spec) {
		script.WriteString("  }\n")
	}

	script.WriteString("}\n\n")

// Add END block + script.WriteString("END {\n") + script.WriteString(fmt.Sprintf(" printf(\"Trace completed for %s\\n\");\n", spec.Target)) + script.WriteString("}\n") + + return script.String(), nil +} + +// needsFiltering checks if any filters are needed +func (tm *BCCTraceManager) needsFiltering(spec TraceSpec) bool { + return spec.PID != 0 || spec.TID != 0 || spec.UID != -1 || + spec.ProcessName != "" || spec.Filter != "" +} + +// buildFilters builds the filter conditions +func (tm *BCCTraceManager) buildFilters(spec TraceSpec) []string { + var filters []string + + if spec.PID != 0 { + filters = append(filters, fmt.Sprintf("pid == %d", spec.PID)) + } + + if spec.TID != 0 { + filters = append(filters, fmt.Sprintf("tid == %d", spec.TID)) + } + + if spec.UID != -1 { + filters = append(filters, fmt.Sprintf("uid == %d", spec.UID)) + } + + if spec.ProcessName != "" { + filters = append(filters, fmt.Sprintf("strncmp(comm, \"%s\", %d) == 0", spec.ProcessName, len(spec.ProcessName))) + } + + // Add custom filter + if spec.Filter != "" { + // Convert common patterns to bpftrace syntax + customFilter := strings.ReplaceAll(spec.Filter, "arg", "arg") + filters = append(filters, customFilter) + } + + return filters +} + +// buildOutputFormat creates the output format string +func (tm *BCCTraceManager) buildOutputFormat(spec TraceSpec) string { + if spec.Format != "" { + // Use custom format + return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|%s", spec.Target, spec.Format) + } + + // Default format + return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|called", spec.Target) +} + +// buildArgumentList creates the argument list for printf +func (tm *BCCTraceManager) buildArgumentList(spec TraceSpec) []string { + // Always include timestamp, pid, tid, comm + args := []string{"nsecs", "pid", "tid", "comm"} + + // Add custom arguments + for _, arg := range spec.Arguments { + switch arg { + case "arg1", "arg2", "arg3", "arg4", "arg5", "arg6": + args = append(args, fmt.Sprintf("arg%s", 
strings.TrimPrefix(arg, "arg"))) + case "retval": + args = append(args, "retval") + case "cpu": + args = append(args, "cpu") + default: + // Custom expression + args = append(args, arg) + } + } + + return args +} + +// monitorTrace monitors a running trace and collects events +func (tm *BCCTraceManager) monitorTrace(traceID string, stdout io.ReadCloser) { + tm.tracesLock.Lock() + trace, exists := tm.traces[traceID] + if !exists { + tm.tracesLock.Unlock() + return + } + tm.tracesLock.Unlock() + + // Start reading output in a goroutine + go func() { + scanner := NewEventScanner(stdout) + for scanner.Scan() { + event := scanner.Event() + if event != nil { + tm.tracesLock.Lock() + if t, exists := tm.traces[traceID]; exists { + t.Events = append(t.Events, *event) + } + tm.tracesLock.Unlock() + } + } + stdout.Close() + }() + + // Wait for the process to complete + err := trace.Process.Wait() + + // Clean up + trace.Cancel() + + tm.tracesLock.Lock() + if err != nil && err.Error() != "signal: killed" { + logging.Warning("Trace %s completed with error: %v", traceID, err) + } else { + logging.Debug("Trace %s completed successfully with %d events", + traceID, len(trace.Events)) + } + + // Signal that monitoring is complete + close(trace.Done) + tm.tracesLock.Unlock() +} + +// GetTraceResult returns the results of a completed trace +func (tm *BCCTraceManager) GetTraceResult(traceID string) (*TraceResult, error) { + tm.tracesLock.RLock() + trace, exists := tm.traces[traceID] + if !exists { + tm.tracesLock.RUnlock() + return nil, fmt.Errorf("trace %s not found", traceID) + } + tm.tracesLock.RUnlock() + + // Wait for trace monitoring to complete + select { + case <-trace.Done: + // Trace monitoring completed + case <-time.After(5 * time.Second): + // Timeout waiting for completion + return nil, fmt.Errorf("timeout waiting for trace %s to complete", traceID) + } + + // Now safely read the final results + tm.tracesLock.RLock() + defer tm.tracesLock.RUnlock() + + result := 
&TraceResult{ + TraceID: traceID, + Spec: trace.Spec, + Events: make([]TraceEvent, len(trace.Events)), + EventCount: len(trace.Events), + StartTime: trace.StartTime, + EndTime: time.Now(), + } + + copy(result.Events, trace.Events) + + // Calculate statistics + result.Statistics = tm.calculateStatistics(result.Events, result.EndTime.Sub(result.StartTime)) + + // Generate summary + result.Summary = tm.generateSummary(result) + + return result, nil +} + +// calculateStatistics calculates statistics for the trace results +func (tm *BCCTraceManager) calculateStatistics(events []TraceEvent, duration time.Duration) TraceStats { + stats := TraceStats{ + TotalEvents: len(events), + EventsByProcess: make(map[string]int), + EventsByUID: make(map[int]int), + } + + if duration > 0 { + stats.EventsPerSecond = float64(len(events)) / duration.Seconds() + } + + // Calculate per-process and per-UID statistics + for _, event := range events { + stats.EventsByProcess[event.ProcessName]++ + stats.EventsByUID[event.UID]++ + } + + // Calculate top processes + for processName, count := range stats.EventsByProcess { + percentage := float64(count) / float64(len(events)) * 100 + stats.TopProcesses = append(stats.TopProcesses, ProcessStat{ + ProcessName: processName, + EventCount: count, + Percentage: percentage, + }) + } + + return stats +} + +// generateSummary generates a human-readable summary +func (tm *BCCTraceManager) generateSummary(result *TraceResult) string { + duration := result.EndTime.Sub(result.StartTime) + + summary := fmt.Sprintf("Traced %s for %v, captured %d events (%.2f events/sec)", + result.Spec.Target, duration, result.EventCount, result.Statistics.EventsPerSecond) + + if len(result.Statistics.TopProcesses) > 0 { + summary += fmt.Sprintf(", top process: %s (%d events)", + result.Statistics.TopProcesses[0].ProcessName, + result.Statistics.TopProcesses[0].EventCount) + } + + return summary +} + +// StopTrace stops an active trace +func (tm *BCCTraceManager) 
StopTrace(traceID string) error { + tm.tracesLock.Lock() + defer tm.tracesLock.Unlock() + + trace, exists := tm.traces[traceID] + if !exists { + return fmt.Errorf("trace %s not found", traceID) + } + + if trace.Process.ProcessState == nil { + // Process is still running, kill it + if err := trace.Process.Process.Kill(); err != nil { + return fmt.Errorf("failed to stop trace: %w", err) + } + } + + trace.Cancel() + return nil +} + +// ListActiveTraces returns a list of active trace IDs +func (tm *BCCTraceManager) ListActiveTraces() []string { + tm.tracesLock.RLock() + defer tm.tracesLock.RUnlock() + + var active []string + for id, trace := range tm.traces { + if trace.Process.ProcessState == nil { + active = append(active, id) + } + } + + return active +} + +// GetSummary returns a summary of the trace manager state +func (tm *BCCTraceManager) GetSummary() map[string]interface{} { + tm.tracesLock.RLock() + defer tm.tracesLock.RUnlock() + + activeCount := 0 + completedCount := 0 + + for _, trace := range tm.traces { + if trace.Process.ProcessState == nil { + activeCount++ + } else { + completedCount++ + } + } + + return map[string]interface{}{ + "capabilities": tm.capabilities, + "active_traces": activeCount, + "completed_traces": completedCount, + "total_traces": len(tm.traces), + "active_trace_ids": tm.ListActiveTraces(), + } +} diff --git a/ebpf_trace_specs.go b/ebpf_trace_specs.go new file mode 100644 index 0000000..3fd137a --- /dev/null +++ b/ebpf_trace_specs.go @@ -0,0 +1,396 @@ +package main + +import ( + "encoding/json" + "fmt" + "strings" +) + +// TestTraceSpecs provides test trace specifications for unit testing the BCC-style tracing +// These are used to validate the tracing functionality without requiring remote API calls +var TestTraceSpecs = map[string]TraceSpec{ + // Basic system call tracing for testing + "test_sys_open": { + ProbeType: "p", + Target: "__x64_sys_openat", + Format: "opening file: %s", + Arguments: []string{"arg2@user"}, // filename + 
Duration: 5, // Short duration for testing + }, + + "test_sys_read": { + ProbeType: "p", + Target: "__x64_sys_read", + Format: "read %d bytes from fd %d", + Arguments: []string{"arg3", "arg1"}, // count, fd + Filter: "arg3 > 100", // Only reads >100 bytes for testing + Duration: 5, + }, + + "test_sys_write": { + ProbeType: "p", + Target: "__x64_sys_write", + Format: "write %d bytes to fd %d", + Arguments: []string{"arg3", "arg1"}, // count, fd + Duration: 5, + }, + + "test_process_creation": { + ProbeType: "p", + Target: "__x64_sys_execve", + Format: "exec: %s", + Arguments: []string{"arg1@user"}, // filename + Duration: 5, + }, + + // Test with different probe types + "test_kretprobe": { + ProbeType: "r", + Target: "__x64_sys_openat", + Format: "open returned: %d", + Arguments: []string{"retval"}, + Duration: 5, + }, + + "test_with_filter": { + ProbeType: "p", + Target: "__x64_sys_write", + Format: "stdout write: %d bytes", + Arguments: []string{"arg3"}, + Filter: "arg1 == 1", // Only stdout writes + Duration: 5, + }, +} + +// GetTestSpec returns a pre-defined test trace specification +func GetTestSpec(name string) (TraceSpec, bool) { + spec, exists := TestTraceSpecs[name] + return spec, exists +} + +// ListTestSpecs returns all available test trace specifications +func ListTestSpecs() map[string]string { + descriptions := map[string]string{ + "test_sys_open": "Test file open operations", + "test_sys_read": "Test read operations (>100 bytes)", + "test_sys_write": "Test write operations", + "test_process_creation": "Test process execution", + "test_kretprobe": "Test kretprobe on file open", + "test_with_filter": "Test filtered writes to stdout", + } + + return descriptions +} + +// TraceSpecBuilder helps build custom trace specifications +type TraceSpecBuilder struct { + spec TraceSpec +} + +// NewTraceSpecBuilder creates a new trace specification builder +func NewTraceSpecBuilder() *TraceSpecBuilder { + return &TraceSpecBuilder{ + spec: TraceSpec{ + ProbeType: 
"p", // Default to kprobe + Duration: 30, // Default 30 seconds + }, + } +} + +// Kprobe sets up a kernel probe +func (b *TraceSpecBuilder) Kprobe(function string) *TraceSpecBuilder { + b.spec.ProbeType = "p" + b.spec.Target = function + return b +} + +// Kretprobe sets up a kernel return probe +func (b *TraceSpecBuilder) Kretprobe(function string) *TraceSpecBuilder { + b.spec.ProbeType = "r" + b.spec.Target = function + return b +} + +// Tracepoint sets up a tracepoint +func (b *TraceSpecBuilder) Tracepoint(category, name string) *TraceSpecBuilder { + b.spec.ProbeType = "t" + b.spec.Target = fmt.Sprintf("%s:%s", category, name) + return b +} + +// Uprobe sets up a userspace probe +func (b *TraceSpecBuilder) Uprobe(library, function string) *TraceSpecBuilder { + b.spec.ProbeType = "u" + b.spec.Library = library + b.spec.Target = function + return b +} + +// Format sets the output format string +func (b *TraceSpecBuilder) Format(format string, args ...string) *TraceSpecBuilder { + b.spec.Format = format + b.spec.Arguments = args + return b +} + +// Filter adds a filter condition +func (b *TraceSpecBuilder) Filter(condition string) *TraceSpecBuilder { + b.spec.Filter = condition + return b +} + +// Duration sets the trace duration in seconds +func (b *TraceSpecBuilder) Duration(seconds int) *TraceSpecBuilder { + b.spec.Duration = seconds + return b +} + +// PID filters by process ID +func (b *TraceSpecBuilder) PID(pid int) *TraceSpecBuilder { + b.spec.PID = pid + return b +} + +// UID filters by user ID +func (b *TraceSpecBuilder) UID(uid int) *TraceSpecBuilder { + b.spec.UID = uid + return b +} + +// ProcessName filters by process name +func (b *TraceSpecBuilder) ProcessName(name string) *TraceSpecBuilder { + b.spec.ProcessName = name + return b +} + +// Build returns the constructed trace specification +func (b *TraceSpecBuilder) Build() TraceSpec { + return b.spec +} + +// TraceSpecParser parses trace specifications from various formats +type TraceSpecParser 
struct{} + +// NewTraceSpecParser creates a new parser +func NewTraceSpecParser() *TraceSpecParser { + return &TraceSpecParser{} +} + +// ParseFromBCCStyle parses BCC trace.py style specifications +// Examples: +// +// "sys_open" -> trace sys_open syscall +// "p::do_sys_open" -> kprobe on do_sys_open +// "r::do_sys_open" -> kretprobe on do_sys_open +// "t:syscalls:sys_enter_open" -> tracepoint +// "sys_read (arg3 > 1024)" -> with filter +// "sys_read \"read %d bytes\", arg3" -> with format +func (p *TraceSpecParser) ParseFromBCCStyle(spec string) (TraceSpec, error) { + result := TraceSpec{ + ProbeType: "p", + Duration: 30, + } + + // Split by quotes to separate format string + parts := strings.Split(spec, "\"") + + var probeSpec string + if len(parts) >= 1 { + probeSpec = strings.TrimSpace(parts[0]) + } + + var formatPart string + if len(parts) >= 2 { + formatPart = parts[1] + } + + var argsPart string + if len(parts) >= 3 { + argsPart = strings.TrimSpace(parts[2]) + if strings.HasPrefix(argsPart, ",") { + argsPart = strings.TrimSpace(argsPart[1:]) + } + } + + // Parse probe specification + if err := p.parseProbeSpec(probeSpec, &result); err != nil { + return result, err + } + + // Parse format string + if formatPart != "" { + result.Format = formatPart + } + + // Parse arguments + if argsPart != "" { + result.Arguments = p.parseArguments(argsPart) + } + + return result, nil +} + +// parseProbeSpec parses the probe specification part +func (p *TraceSpecParser) parseProbeSpec(spec string, result *TraceSpec) error { + // Handle filter conditions in parentheses + if idx := strings.Index(spec, "("); idx != -1 { + filterEnd := strings.LastIndex(spec, ")") + if filterEnd > idx { + result.Filter = strings.TrimSpace(spec[idx+1 : filterEnd]) + spec = strings.TrimSpace(spec[:idx]) + } + } + + // Parse probe type and target + if strings.Contains(spec, ":") { + parts := strings.SplitN(spec, ":", 3) + + if len(parts) >= 1 && parts[0] != "" { + switch parts[0] { + case "p": + 
result.ProbeType = "p" + case "r": + result.ProbeType = "r" + case "t": + result.ProbeType = "t" + case "u": + result.ProbeType = "u" + default: + return fmt.Errorf("unsupported probe type: %s", parts[0]) + } + } + + if len(parts) >= 2 { + result.Library = parts[1] + } + + if len(parts) >= 3 { + result.Target = parts[2] + } else if len(parts) == 2 { + result.Target = parts[1] + result.Library = "" + } + } else { + // Simple function name + result.Target = spec + + // Auto-detect syscall format + if strings.HasPrefix(spec, "sys_") && !strings.HasPrefix(spec, "__x64_sys_") { + result.Target = "__x64_sys_" + spec[4:] + } + } + + return nil +} + +// parseArguments parses the arguments part +func (p *TraceSpecParser) parseArguments(args string) []string { + var result []string + + // Split by comma and clean up + parts := strings.Split(args, ",") + for _, part := range parts { + arg := strings.TrimSpace(part) + if arg != "" { + result = append(result, arg) + } + } + + return result +} + +// ParseFromJSON parses trace specification from JSON +func (p *TraceSpecParser) ParseFromJSON(jsonData []byte) (TraceSpec, error) { + var spec TraceSpec + err := json.Unmarshal(jsonData, &spec) + return spec, err +} + +// GetCommonSpec returns a pre-defined test trace specification (renamed for backward compatibility) +func GetCommonSpec(name string) (TraceSpec, bool) { + // Map old names to new test names for compatibility + testName := name + if strings.HasPrefix(name, "trace_") { + testName = strings.Replace(name, "trace_", "test_", 1) + } + + spec, exists := TestTraceSpecs[testName] + return spec, exists +} + +// ListCommonSpecs returns all available test trace specifications (renamed for backward compatibility) +func ListCommonSpecs() map[string]string { + return ListTestSpecs() +} + +// ValidateTraceSpec validates a trace specification +func ValidateTraceSpec(spec TraceSpec) error { + if spec.Target == "" { + return fmt.Errorf("target function/syscall is required") + } + + if 
spec.Duration <= 0 { + return fmt.Errorf("duration must be positive") + } + + if spec.Duration > 600 { // 10 minutes max + return fmt.Errorf("duration too long (max 600 seconds)") + } + + switch spec.ProbeType { + case "p", "r", "t", "u": + // Valid probe types + case "": + // Default to kprobe + default: + return fmt.Errorf("unsupported probe type: %s", spec.ProbeType) + } + + if spec.ProbeType == "u" && spec.Library == "" { + return fmt.Errorf("library required for userspace probes") + } + + if spec.ProbeType == "t" && !strings.Contains(spec.Target, ":") { + return fmt.Errorf("tracepoint requires format 'category:name'") + } + + return nil +} + +// SuggestSyscallTargets suggests syscall targets based on the issue description +func SuggestSyscallTargets(issueDescription string) []string { + description := strings.ToLower(issueDescription) + var suggestions []string + + // File I/O issues + if strings.Contains(description, "file") || strings.Contains(description, "disk") || strings.Contains(description, "io") { + suggestions = append(suggestions, "trace_sys_open", "trace_sys_read", "trace_sys_write", "trace_sys_unlink") + } + + // Network issues + if strings.Contains(description, "network") || strings.Contains(description, "socket") || strings.Contains(description, "connection") { + suggestions = append(suggestions, "trace_sys_connect", "trace_sys_socket", "trace_sys_bind", "trace_sys_accept") + } + + // Process issues + if strings.Contains(description, "process") || strings.Contains(description, "crash") || strings.Contains(description, "exec") { + suggestions = append(suggestions, "trace_sys_execve", "trace_sys_clone", "trace_sys_exit", "trace_sys_kill") + } + + // Memory issues + if strings.Contains(description, "memory") || strings.Contains(description, "malloc") || strings.Contains(description, "leak") { + suggestions = append(suggestions, "trace_sys_mmap", "trace_sys_brk") + } + + // Performance issues - trace common syscalls + if 
strings.Contains(description, "slow") || strings.Contains(description, "performance") || strings.Contains(description, "hang") { + suggestions = append(suggestions, "trace_sys_read", "trace_sys_write", "trace_sys_connect", "trace_sys_mmap") + } + + // If no specific suggestions, provide general monitoring + if len(suggestions) == 0 { + suggestions = append(suggestions, "trace_sys_execve", "trace_sys_open", "trace_sys_connect") + } + + return suggestions +} diff --git a/ebpf_trace_test.go b/ebpf_trace_test.go new file mode 100644 index 0000000..2930baf --- /dev/null +++ b/ebpf_trace_test.go @@ -0,0 +1,878 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "strings" + "testing" + "time" +) + +// TestBCCTracing demonstrates and tests the new BCC-style tracing functionality +// This test documents the expected behavior and response format of the agent +func TestBCCTracing(t *testing.T) { + fmt.Println("=== BCC-Style eBPF Tracing Unit Tests ===") + fmt.Println() + + // Test 1: List available test specifications + t.Run("ListTestSpecs", func(t *testing.T) { + specs := ListTestSpecs() + fmt.Printf("๐Ÿ“‹ Available Test Specifications:\n") + for name, description := range specs { + fmt.Printf(" - %s: %s\n", name, description) + } + fmt.Println() + + if len(specs) == 0 { + t.Error("No test specifications available") + } + }) + + // Test 2: Parse BCC-style specifications + t.Run("ParseBCCStyle", func(t *testing.T) { + parser := NewTraceSpecParser() + + testCases := []struct { + input string + expected string + }{ + { + input: "sys_open", + expected: "__x64_sys_open", + }, + { + input: "p::do_sys_open", + expected: "do_sys_open", + }, + { + input: "r::sys_read", + expected: "sys_read", + }, + { + input: "sys_write (arg1 == 1)", + expected: "__x64_sys_write", + }, + } + + fmt.Printf("๐Ÿ” Testing BCC-style parsing:\n") + for _, tc := range testCases { + spec, err := parser.ParseFromBCCStyle(tc.input) + if err != nil { + t.Errorf("Failed to parse '%s': %v", 
tc.input, err) + continue + } + + fmt.Printf(" Input: '%s' -> Target: '%s', Type: '%s'\n", + tc.input, spec.Target, spec.ProbeType) + + if spec.Target != tc.expected { + t.Errorf("Expected target '%s', got '%s'", tc.expected, spec.Target) + } + } + fmt.Println() + }) + + // Test 3: Validate trace specifications + t.Run("ValidateSpecs", func(t *testing.T) { + fmt.Printf("โœ… Testing trace specification validation:\n") + + // Valid spec + validSpec := TraceSpec{ + ProbeType: "p", + Target: "__x64_sys_openat", + Format: "opening file", + Duration: 5, + } + + if err := ValidateTraceSpec(validSpec); err != nil { + t.Errorf("Valid spec failed validation: %v", err) + } else { + fmt.Printf(" โœ“ Valid specification passed\n") + } + + // Invalid spec - no target + invalidSpec := TraceSpec{ + ProbeType: "p", + Duration: 5, + } + + if err := ValidateTraceSpec(invalidSpec); err == nil { + t.Error("Invalid spec (no target) should have failed validation") + } else { + fmt.Printf(" โœ“ Invalid specification correctly rejected: %s\n", err.Error()) + } + + fmt.Println() + }) + + // Test 4: Simulate agent response format + t.Run("SimulateAgentResponse", func(t *testing.T) { + fmt.Printf("๐Ÿค– Simulating agent response for BCC-style tracing:\n") + + // Get a test specification + testSpec, exists := GetTestSpec("test_sys_open") + if !exists { + t.Fatal("test_sys_open specification not found") + } + + // Simulate what the agent would return + mockResponse := simulateTraceExecution(testSpec) + + // Print the response format + responseJSON, _ := json.MarshalIndent(mockResponse, "", " ") + fmt.Printf(" Expected Response Format:\n%s\n", string(responseJSON)) + + // Validate response structure + if mockResponse["success"] != true { + t.Error("Expected successful trace execution") + } + + if mockResponse["type"] != "bcc_trace" { + t.Error("Expected type to be 'bcc_trace'") + } + + events, hasEvents := mockResponse["events"].([]TraceEvent) + if !hasEvents || len(events) == 0 { + 
t.Error("Expected trace events in response") + } + + fmt.Println() + }) + + // Test 5: Test different probe types + t.Run("TestProbeTypes", func(t *testing.T) { + fmt.Printf("๐Ÿ”ฌ Testing different probe types:\n") + + probeTests := []struct { + specName string + expected string + }{ + {"test_sys_open", "kprobe"}, + {"test_kretprobe", "kretprobe"}, + {"test_with_filter", "kprobe with filter"}, + } + + for _, test := range probeTests { + spec, exists := GetTestSpec(test.specName) + if !exists { + t.Errorf("Test spec '%s' not found", test.specName) + continue + } + + response := simulateTraceExecution(spec) + fmt.Printf(" %s -> %s: %d events captured\n", + test.specName, test.expected, response["event_count"]) + } + fmt.Println() + }) + + // Test 6: Test trace spec builder + t.Run("TestTraceSpecBuilder", func(t *testing.T) { + fmt.Printf("๐Ÿ—๏ธ Testing trace specification builder:\n") + + // Build a custom trace spec + spec := NewTraceSpecBuilder(). + Kprobe("__x64_sys_write"). + Format("write syscall: %d bytes", "arg3"). + Filter("arg1 == 1"). + Duration(3). 
			Build()

		fmt.Printf("   Built spec: Target=%s, Format=%s, Filter=%s\n",
			spec.Target, spec.Format, spec.Filter)

		if spec.Target != "__x64_sys_write" {
			t.Error("Builder failed to set target correctly")
		}

		// Kprobe() is expected to select probe type "p" (kprobe) on the spec.
		if spec.ProbeType != "p" {
			t.Error("Builder failed to set probe type correctly")
		}

		fmt.Println()
	})
}

// simulateTraceExecution simulates what the agent would return for a trace execution
// This documents the expected response format from the agent.
//
// It fabricates two deterministic TraceEvent records and a TraceStats summary
// for the given spec, then wraps them in the map layout the agent emits for a
// completed "bcc_trace" program (name/type/target/duration/status/success plus
// event_count, events, statistics, data_points, probe_type, format, filter).
func simulateTraceExecution(spec TraceSpec) map[string]interface{} {
	// Simulate some trace events
	// NOTE(review): spec.Format is applied to a single string argument here; if a
	// caller's format expects numeric verbs (e.g. "%d"), Sprintf will emit
	// "%!d(string=...)" in Message — harmless for simulated data, but confirm.
	events := []TraceEvent{
		{
			Timestamp: time.Now().Unix(),
			PID: 1234,
			TID: 1234,
			ProcessName: "test_process",
			Function: spec.Target,
			Message: fmt.Sprintf(spec.Format, "test_file.txt"),
			RawArgs: map[string]string{
				"arg1": "5",
				"arg2": "test_file.txt",
				"arg3": "1024",
			},
		},
		{
			Timestamp: time.Now().Unix(),
			PID: 5678,
			TID: 5678,
			ProcessName: "another_process",
			Function: spec.Target,
			Message: fmt.Sprintf(spec.Format, "data.log"),
			RawArgs: map[string]string{
				"arg1": "3",
				"arg2": "data.log",
				"arg3": "512",
			},
		},
	}

	// Simulate trace statistics
	// NOTE(review): EventsPerSecond divides by spec.Duration — a zero-duration
	// spec would produce +Inf; the test specs in this file always pass > 0.
	stats := TraceStats{
		TotalEvents: len(events),
		EventsByProcess: map[string]int{"test_process": 1, "another_process": 1},
		EventsByUID: map[int]int{1000: 2},
		EventsPerSecond: float64(len(events)) / float64(spec.Duration),
		TopProcesses: []ProcessStat{
			{ProcessName: "test_process", EventCount: 1, Percentage: 50.0},
			{ProcessName: "another_process", EventCount: 1, Percentage: 50.0},
		},
	}

	// Return the expected agent response format
	return map[string]interface{}{
		"name": spec.Target,
		"type": "bcc_trace",
		"target": spec.Target,
		"duration": spec.Duration,
		"description": fmt.Sprintf("Traced %s for %d seconds", spec.Target, spec.Duration),
		"status": "completed",
		"success": true,
		"event_count": len(events),
		"events": events,
		"statistics": stats,
		"data_points": len(events),
		"probe_type": spec.ProbeType,
		"format": spec.Format,
		"filter": spec.Filter,
	}
}

// TestTraceManagerCapabilities tests the trace manager capabilities.
// It prints every capability reported by GetCapabilities and warns (without
// failing) when kernel eBPF support, bpftrace, or root access is missing, so
// the test stays green on machines that cannot run real traces.
func TestTraceManagerCapabilities(t *testing.T) {
	fmt.Println("=== BCC Trace Manager Capabilities Test ===")
	fmt.Println()

	manager := NewBCCTraceManager()
	caps := manager.GetCapabilities()

	fmt.Printf("๐Ÿ”ง Trace Manager Capabilities:\n")
	// Map iteration order is random, so the capability listing order varies run to run.
	for capability, available := range caps {
		status := "โŒ Not Available"
		if available {
			status = "โœ… Available"
		}
		fmt.Printf("   %s: %s\n", capability, status)
	}
	fmt.Println()

	// Check essential capabilities
	if !caps["kernel_ebpf"] {
		fmt.Printf("โš ๏ธ Warning: Kernel eBPF support not detected\n")
	}

	if !caps["bpftrace"] {
		fmt.Printf("โš ๏ธ Warning: bpftrace not available (install with: apt install bpftrace)\n")
	}

	if !caps["root_access"] {
		fmt.Printf("โš ๏ธ Warning: Root access required for eBPF tracing\n")
	}
}

// BenchmarkTraceSpecParsing benchmarks the trace specification parsing of a
// BCC-style probe string (target, quoted format, and @user-annotated argument).
func BenchmarkTraceSpecParsing(b *testing.B) {
	parser := NewTraceSpecParser()
	testInput := "sys_open \"opening %s\", arg2@user"

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, err := parser.ParseFromBCCStyle(testInput)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// TestSyscallSuggestions tests the syscall suggestion functionality
func TestSyscallSuggestions(t *testing.T) {
	fmt.Println("=== Syscall Suggestion Test ===")
	fmt.Println()

	testCases := []struct {
		issue string
		expected int // minimum expected suggestions
		description string
	}{
		{
			issue: "file not found error",
			expected: 1,
			description: "File I/O issue should suggest file-related syscalls",
		},
		{
			issue: "network connection timeout",
			expected: 1,
			description: "Network issue should suggest network syscalls",
		},
		{
			issue: "process crashes randomly",
			expected: 1,
			description: "Process issue should suggest 
process-related syscalls",
		},
		{
			issue: "memory leak detected",
			expected: 1,
			description: "Memory issue should suggest memory syscalls",
		},
		{
			issue: "application is slow",
			expected: 1,
			description: "Performance issue should suggest monitoring syscalls",
		},
	}

	fmt.Printf("๐Ÿ’ก Testing syscall suggestions:\n")
	for _, tc := range testCases {
		suggestions := SuggestSyscallTargets(tc.issue)
		fmt.Printf("   Issue: '%s' -> %d suggestions: %v\n",
			tc.issue, len(suggestions), suggestions)

		// Only a minimum count is asserted; the concrete syscall names are
		// informational and printed above for manual inspection.
		if len(suggestions) < tc.expected {
			t.Errorf("Expected at least %d suggestions for '%s', got %d",
				tc.expected, tc.issue, len(suggestions))
		}
	}
	fmt.Println()
}

// TestMain runs the tests and provides a summary.
// It performs a capability pre-check (warnings only), delegates to m.Run,
// prints a pass/fail banner, and propagates the exit code.
// NOTE(review): os.Exit terminates immediately — any deferred functions in
// this file's tests have already run, but defers added to TestMain itself
// after this point would be skipped (documented behavior of os.Exit).
func TestMain(m *testing.M) {
	fmt.Println("๐Ÿš€ Starting BCC-Style eBPF Tracing Tests")
	fmt.Println("========================================")
	fmt.Println()

	// Run capability check first
	manager := NewBCCTraceManager()
	caps := manager.GetCapabilities()

	if !caps["kernel_ebpf"] {
		fmt.Println("โš ๏ธ Kernel eBPF support not detected - some tests may be limited")
	}
	if !caps["bpftrace"] {
		fmt.Println("โš ๏ธ bpftrace not available - install with: sudo apt install bpftrace")
	}
	if !caps["root_access"] {
		fmt.Println("โš ๏ธ Root access required for actual eBPF tracing")
	}

	fmt.Println()

	// Run the tests
	code := m.Run()

	fmt.Println()
	fmt.Println("========================================")
	if code == 0 {
		fmt.Println("โœ… All BCC-Style eBPF Tracing Tests Passed!")
	} else {
		fmt.Println("โŒ Some tests failed")
	}

	os.Exit(code)
}

// TestBCCTraceManagerRootTest tests the actual BCC trace manager with root privileges
// This test requires root access and will only run meaningful tests when root
func TestBCCTraceManagerRootTest(t *testing.T) {
	fmt.Println("=== BCC Trace Manager Root Test ===")

	// Check if running as root
	if os.Geteuid() != 0 {
		t.Skip("โš ๏ธ Skipping root test - not running as root (use: sudo go test -run TestBCCTraceManagerRootTest)")
		// NOTE(review): t.Skip calls SkipNow, which stops this test goroutine —
		// the return below is unreachable and could be dropped.
		return
	}

	fmt.Println("โœ… Running as root - can test actual eBPF functionality")

	// Test 1: Create BCC trace manager and check capabilities
	manager := NewBCCTraceManager()
	caps := manager.GetCapabilities()

	fmt.Printf("๐Ÿ” BCC Trace Manager Capabilities:\n")
	// NOTE(review): the loop variable "cap" shadows the builtin cap() inside
	// this loop body; harmless here but worth renaming.
	for cap, available := range caps {
		status := "โŒ"
		if available {
			status = "โœ…"
		}
		fmt.Printf("   %s %s: %v\n", status, cap, available)
	}

	// Require essential capabilities
	if !caps["bpftrace"] {
		t.Fatal("โŒ bpftrace not available - install bpftrace package")
	}

	if !caps["root_access"] {
		t.Fatal("โŒ Root access not detected")
	}

	// Test 2: Create and execute a simple trace
	fmt.Println("\n๐Ÿ”ฌ Testing actual eBPF trace execution...")

	spec := TraceSpec{
		ProbeType: "t", // tracepoint
		Target: "syscalls:sys_enter_openat",
		Format: "file access",
		Arguments: []string{}, // Remove invalid arg2@user for tracepoints
		Duration: 3, // 3 seconds
	}

	fmt.Printf("๐Ÿ“ Starting trace: %s for %d seconds\n", spec.Target, spec.Duration)

	traceID, err := manager.StartTrace(spec)
	if err != nil {
		t.Fatalf("โŒ Failed to start trace: %v", err)
	}

	fmt.Printf("๐Ÿš€ Trace started with ID: %s\n", traceID)

	// Generate some file access to capture.
	// NOTE(review): this goroutine is not joined (no WaitGroup); the sleeps
	// below are sized so it finishes inside the trace window — timing-based.
	go func() {
		time.Sleep(1 * time.Second)
		// Create some file operations to trace
		for i := 0; i < 3; i++ {
			testFile := fmt.Sprintf("/tmp/bcc_test_%d.txt", i)

			// This will trigger sys_openat syscalls
			if file, err := os.Create(testFile); err == nil {
				file.WriteString("BCC trace test")
				file.Close()
				os.Remove(testFile)
			}
			time.Sleep(500 * time.Millisecond)
		}
	}()

	// Wait for trace to complete
	time.Sleep(time.Duration(spec.Duration+1) * time.Second)

	// Get results
	result, err := manager.GetTraceResult(traceID)
	if err != nil {
		// Try to stop the trace if it's still running
		manager.StopTrace(traceID)
		t.Fatalf("โŒ Failed to get trace results: %v", err)
	}

	fmt.Printf("\n๐Ÿ“Š Trace Results Summary:\n")
	fmt.Printf("   โ€ข Trace ID: %s\n", result.TraceID)
	fmt.Printf("   โ€ข Target: %s\n", result.Spec.Target)
	fmt.Printf("   โ€ข Duration: %v\n", result.EndTime.Sub(result.StartTime))
	fmt.Printf("   โ€ข Events captured: %d\n", result.EventCount)
	fmt.Printf("   โ€ข Events per second: %.2f\n", result.Statistics.EventsPerSecond)
	fmt.Printf("   โ€ข Summary: %s\n", result.Summary)

	if len(result.Events) > 0 {
		fmt.Printf("\n๐Ÿ“ Sample Events (first 3):\n")
		for i, event := range result.Events {
			if i >= 3 {
				break
			}
			fmt.Printf("   %d. PID:%d TID:%d Process:%s Message:%s\n",
				i+1, event.PID, event.TID, event.ProcessName, event.Message)
		}

		if len(result.Events) > 3 {
			fmt.Printf("   ... and %d more events\n", len(result.Events)-3)
		}
	}

	// Test 3: Validate the trace produced real data.
	// Zero events is deliberately a warning, not a failure: a quiet system may
	// legitimately produce no openat activity in the window.
	if result.EventCount == 0 {
		fmt.Println("โš ๏ธ Warning: No events captured - this might be normal for a quiet system")
	} else {
		fmt.Printf("โœ… Successfully captured %d real eBPF events!\n", result.EventCount)
	}

	fmt.Println("\n๐Ÿงช Testing comprehensive system tracing (Network, Disk, CPU, Memory, Userspace)...")

	testSpecs := []TraceSpec{
		// === SYSCALL TRACING ===
		{
			ProbeType: "p", // kprobe
			Target: "__x64_sys_write",
			Format: "write: fd=%d count=%d",
			Arguments: []string{"arg1", "arg3"},
			Duration: 2,
		},
		{
			ProbeType: "p", // kprobe
			Target: "__x64_sys_read",
			Format: "read: fd=%d count=%d",
			Arguments: []string{"arg1", "arg3"},
			Duration: 2,
		},
		{
			ProbeType: "p", // kprobe
			Target: "__x64_sys_connect",
			Format: "network connect: fd=%d",
			Arguments: []string{"arg1"},
			Duration: 2,
		},
		{
			ProbeType: "p", // kprobe
			Target: "__x64_sys_accept",
			Format: "network accept: fd=%d",
			Arguments: []string{"arg1"},
			Duration: 2,
		},
		// === BLOCK I/O TRACING ===
		// NOTE(review): block:block_io_start / block:block_io_done only exist on
		// recent kernels; older kernels expose block:block_rq_issue /
		// block:block_rq_complete instead — confirm the minimum supported kernel.
		{
			ProbeType: "t", // tracepoint
			Target: "block:block_io_start",
			Format: "block I/O start",
			Arguments: []string{},
			Duration: 2,
		},
		{
			ProbeType: "t", // tracepoint
			Target: "block:block_io_done",
			Format: "block I/O complete",
			Arguments: []string{},
			Duration: 2,
		},
		// === CPU SCHEDULER TRACING ===
		{
			ProbeType: "t", // tracepoint
			Target: "sched:sched_migrate_task",
			Format: "task migration",
			Arguments: []string{},
			Duration: 2,
		},
		{
			ProbeType: "t", // tracepoint
			Target: "sched:sched_pi_setprio",
			Format: "priority change",
			Arguments: []string{},
			Duration: 2,
		},
		// === MEMORY MANAGEMENT ===
		{
			ProbeType: "t", // tracepoint
			Target: "syscalls:sys_enter_brk",
			Format: "memory allocation: brk",
			Arguments: []string{},
			Duration: 2,
		},
		// === KERNEL MEMORY TRACING ===
		{
			ProbeType: "t", // tracepoint
			Target: "kmem:kfree",
			Format: "kernel memory free",
			Arguments: []string{},
			Duration: 2,
		},
	}

	for i, testSpec := range testSpecs {
		// Classify the target into a human-readable category for log output only.
		category := "unknown"
		if strings.Contains(testSpec.Target, "sys_write") || strings.Contains(testSpec.Target, "sys_read") {
			category = "filesystem"
		} else if strings.Contains(testSpec.Target, "sys_connect") || strings.Contains(testSpec.Target, "sys_accept") {
			category = "network"
		} else if strings.Contains(testSpec.Target, "block:") {
			category = "disk I/O"
		} else if strings.Contains(testSpec.Target, "sched:") {
			category = "CPU/scheduler"
		} else if strings.Contains(testSpec.Target, "sys_brk") || strings.Contains(testSpec.Target, "kmem:") {
			category = "memory"
		}

		fmt.Printf("\n   ๐Ÿ” Test %d: [%s] Tracing %s for %d seconds\n", i+1, category, testSpec.Target, testSpec.Duration)

		testTraceID, err := manager.StartTrace(testSpec)
		if err != nil {
			fmt.Printf("   โŒ Failed to start: %v\n", err)
			continue
		}

		// Generate activity specific to this trace type.
		// NOTE(review): probeType is received but never used inside the closure;
		// the switch dispatches on target only. The goroutine is unjoined and
		// relies on the Duration+1s sleep below to finish inside the window.
		go func(target, probeType string) {
			time.Sleep(500 * time.Millisecond)
			switch {
			case strings.Contains(target, "sys_write") || strings.Contains(target, "sys_read"):
				// Generate file I/O
				for j := 0; j < 3; j++ {
					testFile := fmt.Sprintf("/tmp/io_test_%d.txt", j)
					if file, err := os.Create(testFile); err == nil {
						file.WriteString("BCC tracing test data for I/O operations")
						file.Sync()
						file.Close()

						// Read the file back
						if readFile, err := os.Open(testFile); err == nil {
							buffer := make([]byte, 1024)
							readFile.Read(buffer)
							readFile.Close()
						}
						os.Remove(testFile)
					}
					time.Sleep(200 * time.Millisecond)
				}
			case strings.Contains(target, "block:"):
				// Generate disk I/O to trigger block layer events
				for j := 0; j < 3; j++ {
					testFile := fmt.Sprintf("/tmp/block_test_%d.txt", j)
					if file, err := os.Create(testFile); err == nil {
						// Write substantial data to trigger block I/O
						data := make([]byte, 1024*4) // 4KB
						for k := range data {
							data[k] = byte(k % 256)
						}
						file.Write(data)
						file.Sync() // Force write to disk
						file.Close()
					}
					os.Remove(testFile)
					time.Sleep(300 * time.Millisecond)
				}
			case strings.Contains(target, "sched:"):
				// Generate CPU activity to trigger scheduler events
				go func() {
					for j := 0; j < 100; j++ {
						// Create short-lived goroutines to trigger scheduler activity
						go func() {
							time.Sleep(time.Millisecond * 1)
						}()
						time.Sleep(time.Millisecond * 10)
					}
				}()
			case strings.Contains(target, "sys_brk") || strings.Contains(target, "kmem:"):
				// Generate memory allocation activity
				for j := 0; j < 5; j++ {
					// Allocate and free memory to trigger memory management
					data := make([]byte, 1024*1024) // 1MB
					for k := range data {
						data[k] = byte(k % 256)
					}
					// NOTE(review): staticcheck SA4006 — this nil assignment is
					// never read; kept intentionally to release the buffer early.
					data = nil // Allow GC
					time.Sleep(200 * time.Millisecond)
				}
			case strings.Contains(target, "sys_connect") || strings.Contains(target, "sys_accept"):
				// Network operations (these may not generate events in test environment)
				fmt.Printf("      Note: Network syscalls may not trigger events without actual network activity\n")
			default:
				// Generic activity
				for j := 0; j < 3; j++ {
					testFile := fmt.Sprintf("/tmp/generic_test_%d.txt", j)
					if file, err := os.Create(testFile); err == nil {
						file.WriteString("Generic test activity")
						file.Close()
					}
					os.Remove(testFile)
					time.Sleep(300 * time.Millisecond)
				}
			}
		}(testSpec.Target, testSpec.ProbeType)

		// Wait for trace completion
		time.Sleep(time.Duration(testSpec.Duration+1) * time.Second)

		testResult, err := manager.GetTraceResult(testTraceID)
		if err != nil {
			manager.StopTrace(testTraceID)
			fmt.Printf("   โš ๏ธ Result error: %v\n", err)
			continue
		}

		fmt.Printf("   ๐Ÿ“Š Results for %s:\n", testSpec.Target)
		fmt.Printf("      โ€ข Total events: %d\n", testResult.EventCount)
		fmt.Printf("      โ€ข Events/sec: %.2f\n", testResult.Statistics.EventsPerSecond)
		fmt.Printf("      โ€ข Duration: %v\n", testResult.EndTime.Sub(testResult.StartTime))

		// Show process breakdown
		if len(testResult.Statistics.TopProcesses) > 0 {
			fmt.Printf("      โ€ข Top processes:\n")
			for j, proc := range testResult.Statistics.TopProcesses {
				if j >= 3 { // Show top 3
					break
				}
				fmt.Printf("        - %s: %d events (%.1f%%)\n",
					proc.ProcessName, proc.EventCount, proc.Percentage)
			}
		}

		// Show sample events with PIDs, counts, etc.
		if len(testResult.Events) > 0 {
			fmt.Printf("      โ€ข Sample events:\n")
			for j, event := range testResult.Events {
				if j >= 5 { // Show first 5 events
					break
				}
				fmt.Printf("        [%d] PID:%d TID:%d Process:%s Message:%s\n",
					j+1, event.PID, event.TID, event.ProcessName, event.Message)
			}
			if len(testResult.Events) > 5 {
				fmt.Printf("        ... and %d more events\n", len(testResult.Events)-5)
			}
		}

		if testResult.EventCount > 0 {
			fmt.Printf("   โœ… Success: Captured %d real syscall events!\n", testResult.EventCount)
		} else {
			fmt.Printf("   โš ๏ธ No events captured (may be normal for this syscall)\n")
		}
	}

	fmt.Println("\n๐ŸŽ‰ BCC Trace Manager Root Test Complete!")
	fmt.Println("โœ… Real eBPF tracing is working and ready for production use!")
}

// TestAgentEBPFIntegration tests the agent's integration with BCC-style eBPF tracing
// This demonstrates the complete flow from agent to eBPF results
func TestAgentEBPFIntegration(t *testing.T) {
	if os.Geteuid() != 0 {
		t.Skip("โš ๏ธ Skipping agent integration test - requires root access")
		// NOTE(review): unreachable — t.Skip stops the test goroutine.
		return
	}

	fmt.Println("\n=== Agent eBPF Integration Test ===")
	fmt.Println("This test demonstrates the complete agent flow with BCC-style tracing")

	// Create agent with eBPF manager
	agent := &LinuxDiagnosticAgent{}
	agent.ebpfManager = NewBCCTraceManager()
	agent.config = DefaultAgentConfig() // Add config for concurrent execution

	// Test multiple syscalls that would be sent by remote API
	testEBPFRequests := []EBPFRequest{
		{
			Name: "file_operations",
			Type: "syscall",
			Target: "sys_openat", // Will be converted to __x64_sys_openat
			Duration: 3,
			Description: "trace file open operations",
			Filters: map[string]string{},
		},
		{
			Name: "network_operations",
			Type: "syscall",
			Target: "__x64_sys_connect",
			Duration: 2,
			Description: "trace network connections",
			Filters: map[string]string{},
		},
		{
			Name: "io_operations",
			Type: "syscall",
			Target: "sys_write",
			Duration: 2,
			Description: "trace write operations",
			Filters: map[string]string{},
		},
	}

	fmt.Printf("๐Ÿš€ Testing agent with %d eBPF programs...\n\n", len(testEBPFRequests))

	// Execute eBPF programs through agent (simulating API call)
	traceSpecs := agent.convertEBPFProgramsToTraceSpecs(testEBPFRequests)
	results := 
agent.executeBCCTracesConcurrently(traceSpecs)

	fmt.Printf("๐Ÿ“Š Agent eBPF Execution Results:\n")
	// NOTE(review): go vet's printf analyzer flags this non-constant format
	// string; it is safe only because the concatenated value contains no verbs.
	// Prefer fmt.Println(strings.Repeat("=", 51)).
	fmt.Printf("=" + strings.Repeat("=", 50) + "\n\n")

	for i, result := range results {
		fmt.Printf("๐Ÿ” Program %d: %s\n", i+1, result["name"])
		fmt.Printf("   Target: %s\n", result["target"])
		fmt.Printf("   Type: %s\n", result["type"])
		fmt.Printf("   Status: %s\n", result["status"])
		fmt.Printf("   Success: %v\n", result["success"])

		// NOTE(review): unchecked type assertion — panics if "success" is
		// missing or not a bool; the comma-ok form used for the other keys
		// below would be safer.
		if result["success"].(bool) {
			if eventCount, ok := result["event_count"].(int); ok {
				fmt.Printf("   Events captured: %d\n", eventCount)
			}
			if dataPoints, ok := result["data_points"].(int); ok {
				fmt.Printf("   Data points: %d\n", dataPoints)
			}
			if summary, ok := result["summary"].(string); ok {
				fmt.Printf("   Summary: %s\n", summary)
			}

			// Show events if available
			if events, ok := result["events"].([]TraceEvent); ok && len(events) > 0 {
				fmt.Printf("   Sample events:\n")
				for j, event := range events {
					if j >= 3 { // Show first 3
						break
					}
					fmt.Printf("     [%d] PID:%d Process:%s Message:%s\n",
						j+1, event.PID, event.ProcessName, event.Message)
				}
				if len(events) > 3 {
					fmt.Printf("     ... 
and %d more events\n", len(events)-3) + } + } + + // Show statistics if available + if stats, ok := result["statistics"].(TraceStats); ok { + fmt.Printf(" Statistics:\n") + fmt.Printf(" - Events/sec: %.2f\n", stats.EventsPerSecond) + fmt.Printf(" - Total processes: %d\n", len(stats.EventsByProcess)) + if len(stats.TopProcesses) > 0 { + fmt.Printf(" - Top process: %s (%d events)\n", + stats.TopProcesses[0].ProcessName, stats.TopProcesses[0].EventCount) + } + } + } else { + if errMsg, ok := result["error"].(string); ok { + fmt.Printf(" Error: %s\n", errMsg) + } + } + fmt.Println() + } + + // Validate expected agent response format + t.Run("ValidateAgentResponseFormat", func(t *testing.T) { + for i, result := range results { + // Check required fields + requiredFields := []string{"name", "type", "target", "duration", "description", "status", "success"} + for _, field := range requiredFields { + if _, exists := result[field]; !exists { + t.Errorf("Result %d missing required field: %s", i, field) + } + } + + // If successful, check for data fields + if success, ok := result["success"].(bool); ok && success { + // Should have either event_count or data_points + hasEventCount := false + hasDataPoints := false + + if _, ok := result["event_count"]; ok { + hasEventCount = true + } + if _, ok := result["data_points"]; ok { + hasDataPoints = true + } + + if !hasEventCount && !hasDataPoints { + t.Errorf("Successful result %d should have event_count or data_points", i) + } + } + } + }) + + fmt.Println("โœ… Agent eBPF Integration Test Complete!") + fmt.Println("๐Ÿ“ˆ The agent correctly processes eBPF requests and returns detailed syscall data!") +} diff --git a/go.mod b/go.mod index 22f8b94..7e330ef 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.23.0 toolchain go1.24.2 require ( - github.com/cilium/ebpf v0.19.0 + github.com/gorilla/websocket v1.5.3 github.com/joho/godotenv v1.5.1 github.com/sashabaranov/go-openai v1.32.0 github.com/shirou/gopsutil/v3 v3.24.5 @@ -13,7 
+13,6 @@ require ( require ( github.com/go-ole/go-ole v1.2.6 // indirect - github.com/gorilla/websocket v1.5.3 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect diff --git a/go.sum b/go.sum index 24815e2..49dc27e 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,7 @@ -github.com/cilium/ebpf v0.19.0 h1:Ro/rE64RmFBeA9FGjcTc+KmCeY6jXmryu6FfnzPRIao= -github.com/cilium/ebpf v0.19.0/go.mod h1:fLCgMo3l8tZmAdM3B2XqdFzXBpwkcSTroaVqN08OWVY= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s= -github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -13,26 +9,12 @@ github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aN github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= -github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/jsimonetti/rtnetlink/v2 v2.0.1 
h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= -github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= -github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= -github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= -github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= -github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= -github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= -github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/sashabaranov/go-openai v1.32.0 h1:Yk3iE9moX3RBXxrof3OBtUBrE7qZR0zF9ebsoO4zVzI= github.com/sashabaranov/go-openai v1.32.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= @@ -49,10 +31,6 @@ github.com/tklauser/numcpus v0.6.1 
h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+F github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 5a2bbff..85551cf 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -13,6 +13,7 @@ import ( "time" "nannyagentv2/internal/config" + "nannyagentv2/internal/logging" "nannyagentv2/internal/types" ) @@ -103,7 +104,7 @@ func (am *AuthManager) StartDeviceAuthorization() (*types.DeviceAuthResponse, er // PollForToken polls the token endpoint until authorization is complete func (am *AuthManager) PollForToken(deviceCode string) (*types.TokenResponse, error) { - fmt.Println("โณ Waiting for user authorization...") + logging.Info("Waiting for user authorization...") for attempts := 0; attempts < MaxPollAttempts; attempts++ { tokenReq := types.TokenRequest{ @@ -151,7 +152,7 @@ func (am *AuthManager) PollForToken(deviceCode string) (*types.TokenResponse, er } if tokenResp.AccessToken != "" { - fmt.Println("\nโœ… Authorization successful!") + logging.Info("Authorization successful!") return &tokenResp, nil } @@ -230,7 +231,7 @@ func (am *AuthManager) SaveToken(token *types.AuthToken) error { 
refreshTokenPath := filepath.Join(TokenStorageDir, RefreshTokenFile) if err := os.WriteFile(refreshTokenPath, []byte(token.RefreshToken), 0600); err != nil { // Don't fail if refresh token backup fails, just log - fmt.Printf("Warning: Failed to save backup refresh token: %v\n", err) + logging.Warning("Failed to save backup refresh token: %v", err) } } @@ -271,8 +272,8 @@ func (am *AuthManager) RegisterDevice() (*types.AuthToken, error) { return nil, fmt.Errorf("failed to start device authorization: %w", err) } - fmt.Printf("Please visit: %s\n", deviceAuth.VerificationURI) - fmt.Printf("And enter code: %s\n", deviceAuth.UserCode) + logging.Info("Please visit: %s", deviceAuth.VerificationURI) + logging.Info("And enter code: %s", deviceAuth.UserCode) // Step 2: Poll for token tokenResp, err := am.PollForToken(deviceAuth.DeviceCode) @@ -318,13 +319,13 @@ func (am *AuthManager) EnsureAuthenticated() (*types.AuthToken, error) { // Try to load refresh token from backup file if backupRefreshToken, backupErr := am.loadRefreshTokenFromBackup(); backupErr == nil { refreshToken = backupRefreshToken - fmt.Println("๐Ÿ”„ Found backup refresh token, attempting to use it...") + logging.Debug("Found backup refresh token, attempting to use it...") } } } if refreshToken != "" { - fmt.Println("๐Ÿ”„ Attempting to refresh access token...") + logging.Debug("Attempting to refresh access token...") refreshResp, refreshErr := am.RefreshAccessToken(refreshToken) if refreshErr == nil { diff --git a/internal/config/config.go b/internal/config/config.go index 2229bbb..26ba0e8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,6 +6,8 @@ import ( "path/filepath" "strings" + "nannyagentv2/internal/logging" + "github.com/joho/godotenv" ) @@ -39,9 +41,9 @@ func LoadConfig() (*Config, error) { envFile := findEnvFile() if envFile != "" { if err := godotenv.Load(envFile); err != nil { - fmt.Printf("Warning: Could not load .env file from %s: %v\n", envFile, err) + 
logging.Warning("Could not load .env file from %s: %v", envFile, err) } else { - fmt.Printf("Loaded configuration from %s\n", envFile) + logging.Info("Loaded configuration from %s", envFile) } } @@ -124,8 +126,8 @@ func (c *Config) PrintConfig() { return } - fmt.Println("Configuration:") - fmt.Printf(" Supabase Project URL: %s\n", c.SupabaseProjectURL) - fmt.Printf(" Metrics Interval: %d seconds\n", c.MetricsInterval) - fmt.Printf(" Debug: %v\n", c.Debug) + logging.Debug("Configuration:") + logging.Debug(" Supabase Project URL: %s", c.SupabaseProjectURL) + logging.Debug(" Metrics Interval: %d seconds", c.MetricsInterval) + logging.Debug(" Debug: %v", c.Debug) } diff --git a/internal/logging/logger.go b/internal/logging/logger.go index 2130c6a..14e0008 100644 --- a/internal/logging/logger.go +++ b/internal/logging/logger.go @@ -5,11 +5,39 @@ import ( "log" "log/syslog" "os" + "strings" ) +// LogLevel defines the logging level +type LogLevel int + +const ( + LevelDebug LogLevel = iota + LevelInfo + LevelWarning + LevelError +) + +func (l LogLevel) String() string { + switch l { + case LevelDebug: + return "DEBUG" + case LevelInfo: + return "INFO" + case LevelWarning: + return "WARN" + case LevelError: + return "ERROR" + default: + return "INFO" + } +} + +// Logger provides structured logging with configurable levels type Logger struct { syslogWriter *syslog.Writer - debugMode bool + level LogLevel + showEmoji bool } var defaultLogger *Logger @@ -18,9 +46,16 @@ func init() { defaultLogger = NewLogger() } +// NewLogger creates a new logger with default configuration func NewLogger() *Logger { + return NewLoggerWithLevel(getLogLevelFromEnv()) +} + +// NewLoggerWithLevel creates a logger with specified level +func NewLoggerWithLevel(level LogLevel) *Logger { l := &Logger{ - debugMode: os.Getenv("DEBUG") == "true", + level: level, + showEmoji: os.Getenv("LOG_NO_EMOJI") != "true", } // Try to connect to syslog @@ -31,39 +66,87 @@ func NewLogger() *Logger { return l } -func 
(l *Logger) Info(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - if l.syslogWriter != nil { - l.syslogWriter.Info(msg) +// getLogLevelFromEnv parses log level from environment variable +func getLogLevelFromEnv() LogLevel { + level := strings.ToUpper(os.Getenv("LOG_LEVEL")) + switch level { + case "DEBUG": + return LevelDebug + case "INFO", "": + return LevelInfo + case "WARN", "WARNING": + return LevelWarning + case "ERROR": + return LevelError + default: + return LevelInfo } - log.Printf("[INFO] %s", msg) +} + +// logMessage handles the actual logging +func (l *Logger) logMessage(level LogLevel, format string, args ...interface{}) { + if level < l.level { + return + } + + msg := fmt.Sprintf(format, args...) + prefix := fmt.Sprintf("[%s]", level.String()) + + // Add emoji prefix if enabled + if l.showEmoji { + switch level { + case LevelDebug: + prefix = "๐Ÿ” " + prefix + case LevelInfo: + prefix = "โ„น๏ธ " + prefix + case LevelWarning: + prefix = "โš ๏ธ " + prefix + case LevelError: + prefix = "โŒ " + prefix + } + } + + // Log to syslog if available + if l.syslogWriter != nil { + switch level { + case LevelDebug: + l.syslogWriter.Debug(msg) + case LevelInfo: + l.syslogWriter.Info(msg) + case LevelWarning: + l.syslogWriter.Warning(msg) + case LevelError: + l.syslogWriter.Err(msg) + } + } + + log.Printf("%s %s", prefix, msg) } func (l *Logger) Debug(format string, args ...interface{}) { - if !l.debugMode { - return - } - msg := fmt.Sprintf(format, args...) - if l.syslogWriter != nil { - l.syslogWriter.Debug(msg) - } - log.Printf("[DEBUG] %s", msg) + l.logMessage(LevelDebug, format, args...) +} + +func (l *Logger) Info(format string, args ...interface{}) { + l.logMessage(LevelInfo, format, args...) } func (l *Logger) Warning(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) 
- if l.syslogWriter != nil { - l.syslogWriter.Warning(msg) - } - log.Printf("[WARNING] %s", msg) + l.logMessage(LevelWarning, format, args...) } func (l *Logger) Error(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - if l.syslogWriter != nil { - l.syslogWriter.Err(msg) - } - log.Printf("[ERROR] %s", msg) + l.logMessage(LevelError, format, args...) +} + +// SetLevel changes the logging level +func (l *Logger) SetLevel(level LogLevel) { + l.level = level +} + +// GetLevel returns current logging level +func (l *Logger) GetLevel() LogLevel { + return l.level } func (l *Logger) Close() { @@ -73,14 +156,14 @@ func (l *Logger) Close() { } // Global logging functions -func Info(format string, args ...interface{}) { - defaultLogger.Info(format, args...) -} - func Debug(format string, args ...interface{}) { defaultLogger.Debug(format, args...) } +func Info(format string, args ...interface{}) { + defaultLogger.Info(format, args...) +} + func Warning(format string, args ...interface{}) { defaultLogger.Warning(format, args...) } @@ -88,3 +171,13 @@ func Warning(format string, args ...interface{}) { func Error(format string, args ...interface{}) { defaultLogger.Error(format, args...) 
} + +// SetLevel sets the global logger level +func SetLevel(level LogLevel) { + defaultLogger.SetLevel(level) +} + +// GetLevel gets the global logger level +func GetLevel() LogLevel { + return defaultLogger.GetLevel() +} diff --git a/internal/types/types.go b/internal/types/types.go index 49aa2ef..d026aa6 100644 --- a/internal/types/types.go +++ b/internal/types/types.go @@ -241,7 +241,7 @@ type CommandResult struct { type EBPFEnhancedDiagnosticResponse struct { ResponseType string `json:"response_type"` Reasoning string `json:"reasoning"` - Commands []Command `json:"commands"` + Commands []string `json:"commands"` // Changed to []string to match current prompt format EBPFPrograms []EBPFRequest `json:"ebpf_programs"` NextActions []string `json:"next_actions,omitempty"` } diff --git a/investigation_server.go b/investigation_server.go index edacb62..f2ed692 100644 --- a/investigation_server.go +++ b/investigation_server.go @@ -9,6 +9,7 @@ import ( "time" "nannyagentv2/internal/auth" + "nannyagentv2/internal/logging" "nannyagentv2/internal/metrics" "github.com/sashabaranov/go-openai" @@ -62,7 +63,7 @@ func NewInvestigationServer(agent *LinuxDiagnosticAgent, authManager *auth.AuthM agentID = id } else { - fmt.Printf("โŒ Failed to get agent ID from auth manager: %v\n", err) + logging.Error("Failed to get agent ID from auth manager: %v", err) } } @@ -117,9 +118,9 @@ func (s *InvestigationServer) Start() error { // Start realtime polling for backend-initiated investigations if s.supabaseURL != "" && s.authManager != nil { go s.startRealtimePolling() - fmt.Printf("๐Ÿ”„ Realtime investigation polling enabled\n") + logging.Info("Realtime investigation polling enabled") } else { - fmt.Printf("โš ๏ธ Realtime investigation polling disabled (missing Supabase config or auth)\n") + logging.Warning("Realtime investigation polling disabled (missing Supabase config or auth)") } server := &http.Server{ @@ -129,7 +130,7 @@ func (s *InvestigationServer) Start() error { WriteTimeout: 
30 * time.Second, } - fmt.Printf("๐Ÿ” Investigation server started on port %s (Agent ID: %s)\n", s.port, s.agentID) + logging.Info("Investigation server started on port %s (Agent ID: %s)", s.port, s.agentID) return server.ListenAndServe() } @@ -221,7 +222,7 @@ func (s *InvestigationServer) sendCommandResultsToTensorZero(diagnosticResp Diag }) // Send to TensorZero via application agent's sendRequest method - fmt.Printf("๐Ÿ”„ Sending command results to TensorZero for analysis...\n") + logging.Debug("Sending command results to TensorZero for analysis") response, err := s.applicationAgent.sendRequest(messages) if err != nil { return nil, fmt.Errorf("failed to send request to TensorZero: %w", err) @@ -232,7 +233,7 @@ func (s *InvestigationServer) sendCommandResultsToTensorZero(diagnosticResp Diag } content := response.Choices[0].Message.Content - fmt.Printf("๐Ÿค– TensorZero continued analysis:\n%s\n", content) + logging.Debug("TensorZero continued analysis: %s", content) // Try to parse the response to determine if it's diagnostic or resolution var diagnosticNextResp DiagnosticResponse @@ -240,7 +241,7 @@ func (s *InvestigationServer) sendCommandResultsToTensorZero(diagnosticResp Diag // Check if it's another diagnostic response if err := json.Unmarshal([]byte(content), &diagnosticNextResp); err == nil && diagnosticNextResp.ResponseType == "diagnostic" { - fmt.Printf("๐Ÿ”„ TensorZero requests %d more commands\n", len(diagnosticNextResp.Commands)) + logging.Debug("TensorZero requests %d more commands", len(diagnosticNextResp.Commands)) return map[string]interface{}{ "type": "diagnostic", "response": diagnosticNextResp, @@ -295,7 +296,7 @@ func (s *InvestigationServer) handleInvestigation(w http.ResponseWriter, r *http return } - fmt.Printf("๐Ÿ“‹ Received investigation payload with response_type: %s\n", responseType) + logging.Debug("Received investigation payload with response_type: %s", responseType) switch responseType { case "diagnostic": diff --git a/main.go 
b/main.go index 94fe8ad..1f15a78 100644 --- a/main.go +++ b/main.go @@ -13,6 +13,7 @@ import ( "nannyagentv2/internal/auth" "nannyagentv2/internal/config" + "nannyagentv2/internal/logging" "nannyagentv2/internal/metrics" "nannyagentv2/internal/types" ) @@ -22,12 +23,9 @@ const Version = "v2.0.0" // checkRootPrivileges ensures the program is running as root func checkRootPrivileges() { if os.Geteuid() != 0 { - fmt.Fprintf(os.Stderr, "โŒ ERROR: This program must be run as root for eBPF functionality.\n") - fmt.Fprintf(os.Stderr, "Please run with: sudo %s\n", os.Args[0]) - fmt.Fprintf(os.Stderr, "Reason: eBPF programs require root privileges to:\n") - fmt.Fprintf(os.Stderr, " - Load programs into the kernel\n") - fmt.Fprintf(os.Stderr, " - Attach to kernel functions and tracepoints\n") - fmt.Fprintf(os.Stderr, " - Access kernel memory maps\n") + logging.Error("This program must be run as root for eBPF functionality") + logging.Error("Please run with: sudo %s", os.Args[0]) + logging.Error("Reason: eBPF programs require root privileges to:\n - Load programs into the kernel\n - Attach to kernel functions and tracepoints\n - Access kernel memory maps") os.Exit(1) } } @@ -36,7 +34,7 @@ func checkRootPrivileges() { func checkKernelVersionCompatibility() { output, err := exec.Command("uname", "-r").Output() if err != nil { - fmt.Fprintf(os.Stderr, "โŒ ERROR: Cannot determine kernel version: %v\n", err) + logging.Error("Cannot determine kernel version: %v", err) os.Exit(1) } @@ -45,66 +43,57 @@ func checkKernelVersionCompatibility() { // Parse version (e.g., "5.15.0-56-generic" -> major=5, minor=15) parts := strings.Split(kernelVersion, ".") if len(parts) < 2 { - fmt.Fprintf(os.Stderr, "โŒ ERROR: Cannot parse kernel version: %s\n", kernelVersion) + logging.Error("Cannot parse kernel version: %s", kernelVersion) os.Exit(1) } major, err := strconv.Atoi(parts[0]) if err != nil { - fmt.Fprintf(os.Stderr, "โŒ ERROR: Cannot parse major kernel version: %s\n", parts[0]) + 
logging.Error("Cannot parse major kernel version: %s", parts[0]) os.Exit(1) } minor, err := strconv.Atoi(parts[1]) if err != nil { - fmt.Fprintf(os.Stderr, "โŒ ERROR: Cannot parse minor kernel version: %s\n", parts[1]) + logging.Error("Cannot parse minor kernel version: %s", parts[1]) os.Exit(1) } // Check if kernel is 4.4 or higher if major < 4 || (major == 4 && minor < 4) { - fmt.Fprintf(os.Stderr, "โŒ ERROR: Kernel version %s is too old for eBPF.\n", kernelVersion) - fmt.Fprintf(os.Stderr, "Required: Linux kernel 4.4 or higher\n") - fmt.Fprintf(os.Stderr, "Current: %s\n", kernelVersion) - fmt.Fprintf(os.Stderr, "Reason: eBPF requires kernel features introduced in 4.4+:\n") - fmt.Fprintf(os.Stderr, " - BPF system call support\n") - fmt.Fprintf(os.Stderr, " - eBPF program types (kprobe, tracepoint)\n") - fmt.Fprintf(os.Stderr, " - BPF maps and helper functions\n") + logging.Error("Kernel version %s is too old for eBPF", kernelVersion) + logging.Error("Required: Linux kernel 4.4 or higher") + logging.Error("Current: %s", kernelVersion) + logging.Error("Reason: eBPF requires kernel features introduced in 4.4+:\n - BPF system call support\n - eBPF program types (kprobe, tracepoint)\n - BPF maps and helper functions") os.Exit(1) } - } // checkEBPFSupport validates eBPF subsystem availability func checkEBPFSupport() { // Check if /sys/kernel/debug/tracing exists (debugfs mounted) if _, err := os.Stat("/sys/kernel/debug/tracing"); os.IsNotExist(err) { - fmt.Fprintf(os.Stderr, "โš ๏ธ WARNING: debugfs not mounted. Some eBPF features may not work.\n") - fmt.Fprintf(os.Stderr, "To fix: sudo mount -t debugfs debugfs /sys/kernel/debug\n") + logging.Warning("debugfs not mounted. 
Some eBPF features may not work") + logging.Info("To fix: sudo mount -t debugfs debugfs /sys/kernel/debug") } // Check if we can access BPF syscall fd, _, errno := syscall.Syscall(321, 0, 0, 0) // BPF syscall number on x86_64 if errno != 0 && errno != syscall.EINVAL { - fmt.Fprintf(os.Stderr, "โŒ ERROR: BPF syscall not available (errno: %v)\n", errno) - fmt.Fprintf(os.Stderr, "This may indicate:\n") - fmt.Fprintf(os.Stderr, " - Kernel compiled without BPF support\n") - fmt.Fprintf(os.Stderr, " - BPF syscall disabled in kernel config\n") + logging.Error("BPF syscall not available (errno: %v)", errno) + logging.Error("This may indicate:\n - Kernel compiled without BPF support\n - BPF syscall disabled in kernel config") os.Exit(1) } if fd > 0 { syscall.Close(int(fd)) } - } // runInteractiveDiagnostics starts the interactive diagnostic session func runInteractiveDiagnostics(agent *LinuxDiagnosticAgent) { - fmt.Println("") - fmt.Println("๐Ÿ” Linux eBPF-Enhanced Diagnostic Agent") - fmt.Println("=======================================") - fmt.Println("Linux Diagnostic Agent Started") - fmt.Println("Enter a system issue description (or 'quit' to exit):") + logging.Info("=== Linux eBPF-Enhanced Diagnostic Agent ===") + logging.Info("Linux Diagnostic Agent Started") + logging.Info("Enter a system issue description (or 'quit' to exit):") scanner := bufio.NewScanner(os.Stdin) for { @@ -124,7 +113,7 @@ func runInteractiveDiagnostics(agent *LinuxDiagnosticAgent) { // Process the issue with AI capabilities via TensorZero if err := agent.DiagnoseIssue(input); err != nil { - fmt.Printf("Error: %v\n", err) + logging.Error("Diagnosis failed: %v", err) } } @@ -132,19 +121,18 @@ func runInteractiveDiagnostics(agent *LinuxDiagnosticAgent) { log.Fatal(err) } - fmt.Println("Goodbye!") + logging.Info("Goodbye!") } func main() { - fmt.Printf("๐Ÿš€ NannyAgent v%s starting...\n", Version) + logging.Info("NannyAgent v%s starting...", Version) // Perform system compatibility checks first - 
fmt.Println("Performing system compatibility checks...") + logging.Info("Performing system compatibility checks...") checkRootPrivileges() checkKernelVersionCompatibility() checkEBPFSupport() - fmt.Println("โœ… All system checks passed") - fmt.Println("") + logging.Info("All system checks passed") // Load configuration cfg, err := config.LoadConfig() @@ -164,10 +152,10 @@ func main() { log.Fatalf("โŒ Authentication failed: %v", err) } - fmt.Println("โœ… Authentication successful!") + logging.Info("Authentication successful!") - // Initialize the diagnostic agent for interactive CLI use - agent := NewLinuxDiagnosticAgent() + // Initialize the diagnostic agent for interactive CLI use with authentication + agent := NewLinuxDiagnosticAgentWithAuth(authManager) // Initialize a separate agent for WebSocket investigations using the application model applicationAgent := NewLinuxDiagnosticAgent() @@ -177,53 +165,53 @@ func main() { wsClient := NewWebSocketClient(applicationAgent, authManager) go func() { if err := wsClient.Start(); err != nil { - log.Printf("โŒ WebSocket client error: %v", err) + logging.Error("WebSocket client error: %v", err) } }() // Start background metrics collection in a goroutine go func() { - fmt.Println("โค๏ธ Starting background metrics collection and heartbeat...") + logging.Debug("Starting background metrics collection and heartbeat...") ticker := time.NewTicker(time.Duration(cfg.MetricsInterval) * time.Second) defer ticker.Stop() // Send initial heartbeat if err := sendHeartbeat(cfg, token, metricsCollector); err != nil { - log.Printf("โš ๏ธ Initial heartbeat failed: %v", err) + logging.Warning("Initial heartbeat failed: %v", err) } // Main heartbeat loop for range ticker.C { // Check if token needs refresh if authManager.IsTokenExpired(token) { - fmt.Println("๐Ÿ”„ Token expiring soon, refreshing...") + logging.Debug("Token expiring soon, refreshing...") newToken, refreshErr := authManager.EnsureAuthenticated() if refreshErr != nil { - 
log.Printf("โŒ Token refresh failed: %v", refreshErr) + logging.Warning("Token refresh failed: %v", refreshErr) continue } token = newToken - fmt.Println("โœ… Token refreshed successfully") + logging.Debug("Token refreshed successfully") } // Send heartbeat if err := sendHeartbeat(cfg, token, metricsCollector); err != nil { - log.Printf("โš ๏ธ Heartbeat failed: %v", err) + logging.Warning("Heartbeat failed: %v", err) // If unauthorized, try to refresh token if err.Error() == "unauthorized" { - fmt.Println("๐Ÿ”„ Unauthorized, attempting token refresh...") + logging.Debug("Unauthorized, attempting token refresh...") newToken, refreshErr := authManager.EnsureAuthenticated() if refreshErr != nil { - log.Printf("โŒ Token refresh failed: %v", refreshErr) + logging.Warning("Token refresh failed: %v", refreshErr) continue } token = newToken // Retry heartbeat with new token (silently) if retryErr := sendHeartbeat(cfg, token, metricsCollector); retryErr != nil { - log.Printf("โš ๏ธ Retry heartbeat failed: %v", retryErr) + logging.Warning("Retry heartbeat failed: %v", retryErr) } } } diff --git a/scripts/debug_trace_script.sh b/scripts/debug_trace_script.sh new file mode 100755 index 0000000..bcc5e0f --- /dev/null +++ b/scripts/debug_trace_script.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Test the current script generation +echo "Testing tracepoint script generation..." 
+ +# Simulate what the failing test does +echo "Target: syscalls:sys_enter_openat" +echo "ProbeType: t" +echo "" +echo "Generated bpftrace script would be:" +echo "tracepoint:syscalls:sys_enter_openat {" +echo " printf(\"TRACE|%d|%d|%d|%s|syscalls:sys_enter_openat|file access\\n\", nsecs, pid, tid, comm, arg2@user);" +echo "}" +echo "" +echo "This is INVALID - should be:" +echo "tracepoint:syscalls:sys_enter_openat {" +echo " printf(\"TRACE|%d|%d|%d|%s|openat|file access\\n\", nsecs, pid, tid, comm);" +echo "}" + diff --git a/websocket_client.go b/websocket_client.go index e7f5bf7..831e677 100644 --- a/websocket_client.go +++ b/websocket_client.go @@ -13,7 +13,9 @@ import ( "time" "nannyagentv2/internal/auth" + "nannyagentv2/internal/logging" "nannyagentv2/internal/metrics" + "nannyagentv2/internal/types" "github.com/gorilla/websocket" "github.com/sashabaranov/go-openai" @@ -74,7 +76,7 @@ func NewWebSocketClient(agent *LinuxDiagnosticAgent, authManager *auth.AuthManag agentID = id // Agent ID retrieved successfully } else { - fmt.Printf("โŒ Failed to get agent ID from auth manager: %v\n", err) + logging.Error("Failed to get agent ID from auth manager: %v", err) } } @@ -178,7 +180,7 @@ func (c *WebSocketClient) connect() error { if err != nil { c.consecutiveFailures++ if c.consecutiveFailures >= 5 && resp != nil { - fmt.Printf("โŒ WebSocket handshake failed with status: %d (failure #%d)\n", resp.StatusCode, c.consecutiveFailures) + logging.Error("WebSocket handshake failed with status: %d (failure #%d)", resp.StatusCode, c.consecutiveFailures) } return fmt.Errorf("websocket connection failed: %v", err) } @@ -205,7 +207,7 @@ func (c *WebSocketClient) handleMessages() { case <-c.ctx.Done(): // Only log context cancellation if there have been failures if c.consecutiveFailures >= 5 { - fmt.Printf("๐Ÿ“ก Context cancelled after %v, stopping message handler\n", time.Since(connectionStart)) + logging.Debug("Context cancelled after %v, stopping message handler", 
time.Since(connectionStart)) } return default: @@ -223,14 +225,14 @@ func (c *WebSocketClient) handleMessages() { // Only log specific errors after failure threshold if c.consecutiveFailures >= 5 { if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { - log.Printf("๐Ÿ”’ WebSocket closed normally after %v: %v", connectionDuration, err) + logging.Debug("WebSocket closed normally after %v: %v", connectionDuration, err) } else if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { - log.Printf("๐Ÿ’ฅ ABNORMAL CLOSE after %v (code 1006 = server-side timeout/kill): %v", connectionDuration, err) - log.Printf("๐Ÿ•’ Last read took %v, connection lived %v", readDuration, connectionDuration) + logging.Error("ABNORMAL CLOSE after %v (code 1006 = server-side timeout/kill): %v", connectionDuration, err) + logging.Debug("Last read took %v, connection lived %v", readDuration, connectionDuration) } else if netErr, ok := err.(net.Error); ok && netErr.Timeout() { - log.Printf("โฐ READ TIMEOUT after %v: %v", connectionDuration, err) + logging.Warning("READ TIMEOUT after %v: %v", connectionDuration, err) } else { - log.Printf("โŒ WebSocket error after %v: %v", connectionDuration, err) + logging.Error("WebSocket error after %v: %v", connectionDuration, err) } } @@ -239,7 +241,7 @@ func (c *WebSocketClient) handleMessages() { // Only show diagnostics after multiple failures if c.consecutiveFailures >= 5 { - log.Printf("๐Ÿ” DIAGNOSTIC - Connection failed #%d after %v", c.consecutiveFailures, connectionDuration) + logging.Debug("DIAGNOSTIC - Connection failed #%d after %v", c.consecutiveFailures, connectionDuration) } // Attempt reconnection instead of returning immediately @@ -265,7 +267,7 @@ func (c *WebSocketClient) handleMessages() { // Task result acknowledged default: - log.Printf("โš ๏ธ Unknown message type: %s", message.Type) + logging.Warning("Unknown message type: %s", message.Type) } } } @@ 
-276,14 +278,14 @@ func (c *WebSocketClient) handleInvestigationTask(data interface{}) { // Parse task data taskBytes, err := json.Marshal(data) if err != nil { - log.Printf("โŒ Error marshaling task data: %v", err) + logging.Error("Error marshaling task data: %v", err) return } var task InvestigationTask err = json.Unmarshal(taskBytes, &task) if err != nil { - log.Printf("โŒ Error unmarshaling investigation task: %v", err) + logging.Error("Error unmarshaling investigation task: %v", err) return } @@ -300,7 +302,7 @@ func (c *WebSocketClient) handleInvestigationTask(data interface{}) { if err != nil { taskResult.Error = err.Error() - fmt.Printf("โŒ Task execution failed: %v\n", err) + logging.Error("Task execution failed: %v", err) } else { taskResult.CommandResults = results // Task executed successfully @@ -356,7 +358,7 @@ func (c *WebSocketClient) executeDiagnosticCommands(diagnosticPayload map[string if err != nil { result["error"] = err.Error() - fmt.Printf("โŒ Command [%s] failed: %v (exit code: %d)\n", id, err, exitCode) + logging.Warning("Command [%s] failed: %v (exit code: %d)", id, err, exitCode) } commandResults = append(commandResults, result) @@ -379,7 +381,7 @@ func (c *WebSocketClient) executeDiagnosticCommands(diagnosticPayload map[string // executeEBPFPrograms executes eBPF monitoring programs using the real eBPF manager func (c *WebSocketClient) executeEBPFPrograms(ebpfPrograms []interface{}) []map[string]interface{} { - var ebpfRequests []EBPFRequest + var ebpfRequests []types.EBPFRequest // Convert interface{} to EBPFRequest structs for _, prog := range ebpfPrograms { @@ -398,7 +400,7 @@ func (c *WebSocketClient) executeEBPFPrograms(ebpfPrograms []interface{}) []map[ continue } - ebpfRequests = append(ebpfRequests, EBPFRequest{ + ebpfRequests = append(ebpfRequests, types.EBPFRequest{ Name: name, Type: progType, Target: target, @@ -444,7 +446,7 @@ func (c *WebSocketClient) executeCommandsFromPayload(commands []interface{}) []m if err != nil { 
result["error"] = err.Error() - fmt.Printf("โŒ Command [%s] failed: %v (exit code: %d)\n", id, err, exitCode) + logging.Warning("Command [%s] failed: %v (exit code: %d)", id, err, exitCode) } commandResults = append(commandResults, result) @@ -502,7 +504,7 @@ func (c *WebSocketClient) sendTaskResult(result TaskResult) { err := c.conn.WriteJSON(message) if err != nil { - log.Printf("โŒ Error sending task result: %v", err) + logging.Error("Error sending task result: %v", err) } } @@ -516,7 +518,7 @@ func (c *WebSocketClient) startHeartbeat() { for { select { case <-c.ctx.Done(): - fmt.Printf("๐Ÿ’“ Heartbeat stopped due to context cancellation\n") + logging.Debug("Heartbeat stopped due to context cancellation") return case <-ticker.C: // Sending heartbeat @@ -531,8 +533,8 @@ func (c *WebSocketClient) startHeartbeat() { err := c.conn.WriteJSON(heartbeat) if err != nil { - log.Printf("โŒ Error sending heartbeat: %v", err) - fmt.Printf("๐Ÿ’“ Heartbeat failed, connection likely dead\n") + logging.Error("Error sending heartbeat: %v", err) + logging.Debug("Heartbeat failed, connection likely dead") return } // Heartbeat sent @@ -656,14 +658,14 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest for { tzResp, tzErr := c.agent.sendRequestWithEpisode(messages, episodeID) if tzErr != nil { - fmt.Printf("โš ๏ธ TensorZero continuation failed: %v\n", tzErr) + logging.Warning("TensorZero continuation failed: %v", tzErr) // Fall back to marking completed with command results only c.updateInvestigationStatus(investigation.ID, "completed", resultsForDB, nil) return } if len(tzResp.Choices) == 0 { - fmt.Printf("โš ๏ธ No choices in TensorZero response\n") + logging.Warning("No choices in TensorZero response") c.updateInvestigationStatus(investigation.ID, "completed", resultsForDB, nil) return } @@ -672,7 +674,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest if len(aiContent) > 300 { // AI response received 
successfully } else { - fmt.Printf("๐Ÿค– AI Response: %s\n", aiContent) + logging.Debug("AI Response: %s", aiContent) } // Check if this is a resolution response (final) @@ -683,14 +685,14 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest Confidence string `json:"confidence"` } - fmt.Printf("๐Ÿ” Analyzing AI response type...\n") + logging.Debug("Analyzing AI response type...") if err := json.Unmarshal([]byte(aiContent), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" { // This is the final resolution - show summary and complete - fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n") - fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause) - fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan) - fmt.Printf("Confidence: %s\n", resolutionResp.Confidence) + logging.Info("=== DIAGNOSIS COMPLETE ===") + logging.Info("Root Cause: %s", resolutionResp.RootCause) + logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan) + logging.Info("Confidence: %s", resolutionResp.Confidence) finalAIContent = aiContent break } @@ -703,7 +705,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest } if err := json.Unmarshal([]byte(aiContent), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" { - fmt.Printf("๐Ÿ”„ AI requested additional diagnostics, executing...\n") + logging.Debug("AI requested additional diagnostics, executing...") // Execute additional commands if any additionalResults := map[string]interface{}{ @@ -711,7 +713,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest } if len(diagnosticResp.Commands) > 0 { - fmt.Printf("๐Ÿ”ง Executing %d additional diagnostic commands...\n", len(diagnosticResp.Commands)) + logging.Debug("Executing %d additional diagnostic commands", len(diagnosticResp.Commands)) commandResults := c.executeCommandsFromPayload(diagnosticResp.Commands) additionalResults["command_results"] = 
commandResults } @@ -738,7 +740,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest } // If neither resolution nor diagnostic, treat as final response - fmt.Printf("โš ๏ธ Unknown response type - treating as final response\n") + logging.Warning("Unknown response type - treating as final response") finalAIContent = aiContent break } @@ -814,21 +816,21 @@ func (c *WebSocketClient) attemptReconnection() { // Only show messages after 5 consecutive failures if c.consecutiveFailures >= 5 { - log.Printf("๐Ÿ”„ Attempting WebSocket reconnection (attempt %d/%d) - %d consecutive failures", i+1, len(backoffDurations), c.consecutiveFailures) + logging.Info("Attempting WebSocket reconnection (attempt %d/%d) - %d consecutive failures", i+1, len(backoffDurations), c.consecutiveFailures) } time.Sleep(backoff) if err := c.connect(); err != nil { if c.consecutiveFailures >= 5 { - log.Printf("โŒ Reconnection attempt %d failed: %v", i+1, err) + logging.Warning("Reconnection attempt %d failed: %v", i+1, err) } continue } // Successfully reconnected - reset failure counter if c.consecutiveFailures >= 5 { - log.Printf("โœ… WebSocket reconnected successfully after %d failures", c.consecutiveFailures) + logging.Info("WebSocket reconnected successfully after %d failures", c.consecutiveFailures) } c.consecutiveFailures = 0 go c.handleMessages() // Restart message handling @@ -836,5 +838,5 @@ func (c *WebSocketClient) attemptReconnection() { } } - log.Printf("โŒ Failed to reconnect after %d attempts, giving up", len(backoffDurations)) + logging.Error("Failed to reconnect after %d attempts, giving up", len(backoffDurations)) }