Agent and websocket investigations work fine

2025-10-27 19:13:39 +01:00
parent 0a8b2dc202
commit 8832450a1f
8 changed files with 1694 additions and 19 deletions
--- a/agent.go
+++ b/agent.go
@@ -102,7 +102,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {

 	for {
 		// Send request to TensorZero API via OpenAI SDK
-		response, err := a.sendRequest(messages)
+		response, err := a.sendRequestWithEpisode(messages, a.episodeID)
 		if err != nil {
 			return fmt.Errorf("failed to send request: %w", err)
 		}
@@ -115,34 +115,73 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 		fmt.Printf("\nAI Response:\n%s\n", content)

 		// Parse the response to determine next action
-		var diagnosticResp DiagnosticResponse
+		var diagnosticResp EBPFEnhancedDiagnosticResponse
 		var resolutionResp ResolutionResponse

-		// Try to parse as diagnostic response first
+		// Try to parse as diagnostic response first (with eBPF support)
 		if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
 			// Handle diagnostic phase
 			fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning)

-			if len(diagnosticResp.Commands) == 0 {
-				fmt.Println("No commands to execute in diagnostic phase")
-				break
-			}
-
 			// Execute commands and collect results
 			commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
-			for _, cmd := range diagnosticResp.Commands {
-				fmt.Printf("\nExecuting command '%s': %s\n", cmd.ID, cmd.Command)
-				result := a.executor.Execute(cmd)
-				commandResults = append(commandResults, result)
+			if len(diagnosticResp.Commands) > 0 {
+				fmt.Printf("🔧 Executing diagnostic commands...\n")
+				for _, cmd := range diagnosticResp.Commands {
+					fmt.Printf("⚙️  Executing command '%s': %s\n", cmd.ID, cmd.Command)
+					result := a.executor.Execute(cmd)
+					commandResults = append(commandResults, result)

-				fmt.Printf("Output:\n%s\n", result.Output)
-				if result.Error != "" {
-					fmt.Printf("Error: %s\n", result.Error)
+					if result.ExitCode == 0 {
+						fmt.Printf("✅ Command '%s' completed successfully\n", cmd.ID)
+					} else {
+						fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode)
+					}
 				}
 			}

-			// Prepare command results as user message
-			resultsJSON, err := json.MarshalIndent(commandResults, "", "  ")
+			// Execute eBPF programs if present
+			var ebpfResults []map[string]interface{}
+			if len(diagnosticResp.EBPFPrograms) > 0 {
+				fmt.Printf("🔬 Executing %d eBPF programs...\n", len(diagnosticResp.EBPFPrograms))
+				ebpfResults = a.executeEBPFPrograms(diagnosticResp.EBPFPrograms)
+			}
+
+			// Prepare combined results as user message
+			allResults := map[string]interface{}{
+				"command_results":   commandResults,
+				"executed_commands": len(commandResults),
+			}
+
+			// Include eBPF results if any were executed
+			if len(ebpfResults) > 0 {
+				allResults["ebpf_results"] = ebpfResults
+				allResults["executed_ebpf_programs"] = len(ebpfResults)
+
+				// Extract evidence summary for TensorZero
+				evidenceSummary := make([]string, 0)
+				for _, result := range ebpfResults {
+					name := result["name"]
+					eventCount := result["data_points"]
+					description := result["description"]
+					status := result["status"]
+
+					summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, eventCount, status, description)
+					evidenceSummary = append(evidenceSummary, summaryStr)
+				}
+				allResults["ebpf_evidence_summary"] = evidenceSummary
+
+				fmt.Printf("<22> Sending eBPF monitoring data to TensorZero:\n")
+				for _, summary := range evidenceSummary {
+					fmt.Printf("   - %s\n", summary)
+				}
+
+				fmt.Printf("✅ Executed %d commands, %d eBPF programs\n", len(commandResults), len(ebpfResults))
+			} else {
+				fmt.Printf("✅ Executed %d commands\n", len(commandResults))
+			}
+
+			resultsJSON, err := json.MarshalIndent(allResults, "", "  ")
 			if err != nil {
 				return fmt.Errorf("failed to marshal command results: %w", err)
 			}
@@ -178,6 +217,127 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 	return nil
 }

+// executeEBPFPrograms runs the requested eBPF programs one at a time through a.ebpfManager (start, wait, fetch results, stop) and returns one result map per program; it returns no results when a.ebpfManager is nil.
+func (a *LinuxDiagnosticAgent) executeEBPFPrograms(ebpfPrograms []EBPFRequest) []map[string]interface{} {
+	var results []map[string]interface{} // stays nil when the manager is missing or no programs are given
+
+	if a.ebpfManager == nil {
+		fmt.Printf("❌ eBPF manager not initialized\n")
+		return results
+	}
+
+	for _, prog := range ebpfPrograms {
+		fmt.Printf("🔬 Starting eBPF program [%s]: %s -> %s (%ds)\n", prog.Name, prog.Type, prog.Target, int(prog.Duration))
+
+		// Start the program through the eBPF manager; a start failure is recorded as a "failed" result and the loop moves on
+		programID, err := a.ebpfManager.StartEBPFProgram(prog)
+		if err != nil {
+			fmt.Printf("❌ Failed to start eBPF program [%s]: %v\n", prog.Name, err)
+			result := map[string]interface{}{
+				"name":        prog.Name,
+				"type":        prog.Type,
+				"target":      prog.Target,
+				"duration":    int(prog.Duration),
+				"description": prog.Description,
+				"status":      "failed",
+				"error":       err.Error(),
+				"success":     false,
+			}
+			results = append(results, result)
+			continue // NOTE(review): this failure entry has no "data_points" key, which the evidence summary in DiagnoseIssue reads
+		}
+
+		// Block for prog.Duration seconds while the program runs
+		fmt.Printf("⏰ Waiting %d seconds for eBPF program to collect data...\n", int(prog.Duration))
+		time.Sleep(time.Duration(prog.Duration) * time.Second)
+
+		// Extra 500ms grace period, intended to let the manager's collectEvents goroutine finish and store results (collectEvents is not shown here)
+		fmt.Printf("⏳ Allowing program to complete data collection...\n")
+		time.Sleep(500 * time.Millisecond)
+
+		// Fetch the results (expected to be in the manager's completedResults by now - TODO confirm against the manager)
+		fmt.Printf("📊 Getting results for eBPF program [%s]...\n", prog.Name)
+
+		// Run GetProgramResults in a goroutine and wait on a channel so the fetch can be bounded by a timeout
+		type resultPair struct {
+			trace *EBPFTrace
+			err   error
+		}
+		resultChan := make(chan resultPair, 1) // buffer of 1 lets the goroutine's single send complete even after the timeout branch was taken
+
+		go func() {
+			trace, err := a.ebpfManager.GetProgramResults(programID)
+			resultChan <- resultPair{trace, err}
+		}()
+
+		var trace *EBPFTrace
+		var resultErr error
+
+		select {
+		case result := <-resultChan:
+			trace = result.trace
+			resultErr = result.err
+		case <-time.After(3 * time.Second):
+			resultErr = fmt.Errorf("timeout getting results after 3 seconds")
+		}
+
+		// Always attempt to stop the program, even after a failed or timed-out fetch; it may already have been stopped
+		fmt.Printf("🛑 Stopping eBPF program [%s]...\n", prog.Name)
+		stopErr := a.ebpfManager.StopProgram(programID)
+		if stopErr != nil {
+			fmt.Printf("⚠️  eBPF program [%s] cleanup: %v (may have already completed)\n", prog.Name, stopErr)
+			// A stop error is only logged; any results already obtained are still processed below
+		}
+
+		if resultErr != nil {
+			fmt.Printf("❌ Failed to get results for eBPF program [%s]: %v\n", prog.Name, resultErr)
+			result := map[string]interface{}{
+				"name":        prog.Name,
+				"type":        prog.Type,
+				"target":      prog.Target,
+				"duration":    int(prog.Duration),
+				"description": prog.Description,
+				"status":      "collection_failed",
+				"error":       resultErr.Error(),
+				"success":     false,
+			}
+			results = append(results, result)
+			continue
+		} // Success path: build the result map from the trace returned by the manager
+		result := map[string]interface{}{
+			"name":        prog.Name,
+			"type":        prog.Type,
+			"target":      prog.Target,
+			"duration":    int(prog.Duration),
+			"description": prog.Description,
+			"status":      "completed",
+			"success":     true,
+		}
+
+		// Copy the trace fields into the result; a nil trace is still reported as "completed" but with zero data points and an "error" entry
+		if trace != nil {
+			result["trace_id"] = trace.TraceID
+			result["data_points"] = trace.EventCount
+			result["events"] = trace.Events
+			result["summary"] = trace.Summary
+			result["process_list"] = trace.ProcessList
+			result["start_time"] = trace.StartTime.Format(time.RFC3339)
+			result["end_time"] = trace.EndTime.Format(time.RFC3339)
+			result["actual_duration"] = trace.EndTime.Sub(trace.StartTime).Seconds()
+
+			fmt.Printf("✅ eBPF program [%s] completed - collected %d real events\n", prog.Name, trace.EventCount)
+		} else {
+			result["data_points"] = 0
+			result["error"] = "No trace data returned"
+			fmt.Printf("⚠️  eBPF program [%s] completed but returned no trace data\n", prog.Name)
+		}
+
+		results = append(results, result)
+	}
+
+	return results
+}
+
 // TensorZeroRequest represents a request structure compatible with TensorZero's episode_id
 type TensorZeroRequest struct {
 	Model     string                         `json:"model"`
@@ -193,6 +353,11 @@ type TensorZeroResponse struct {

 // sendRequest sends a request to the TensorZero API via Supabase proxy with JWT authentication
 func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
+	return a.sendRequestWithEpisode(messages, "") // no explicit episode ID; the callee still applies a.episodeID when it is non-empty
+}
+
+// sendRequestWithEpisode sends a request with a specific episode ID
+func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()

@@ -202,9 +367,12 @@ func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessa
 		Messages: messages,
 	}

-	// Include tensorzero::episode_id for conversation continuity (if we have one)
+	// Include tensorzero::episode_id for conversation continuity
+	// Use agent's existing episode ID if available, otherwise use provided one
 	if a.episodeID != "" {
 		tzRequest.EpisodeID = a.episodeID
+	} else if episodeID != "" {
+		tzRequest.EpisodeID = episodeID
 	}

 	fmt.Printf("Debug: Sending request to model: %s", a.model)