somewhat working ebpf bpftrace

2025-11-08 20:42:07 +01:00
parent 190e54dd38
commit 794111cb44
16 changed files with 2834 additions and 216 deletions
--- a/agent.go
+++ b/agent.go
@@ -11,6 +11,9 @@ import (
 	"sync"
 	"time"

+	"nannyagentv2/internal/logging"
+	"nannyagentv2/internal/types"
+
 	"github.com/sashabaranov/go-openai"
 )

@@ -25,28 +28,6 @@ type DiagnosticResponse struct {
 	ConfidenceLevel float64  `json:"confidence_level"`
 }

-// EBPFRequest represents a request for eBPF program execution
-type EBPFRequest struct {
-	Name        string            `json:"name"`
-	Type        string            `json:"type"`
-	Target      string            `json:"target"`
-	Duration    int               `json:"duration"`
-	Filters     map[string]string `json:"filters,omitempty"`
-	Description string            `json:"description"`
-}
-
-// EBPFEnhancedDiagnosticResponse represents the enhanced diagnostic response with eBPF
-type EBPFEnhancedDiagnosticResponse struct {
-	ResponseType    string        `json:"response_type"`
-	Phase           string        `json:"phase"`
-	Analysis        string        `json:"analysis"`
-	Commands        []string      `json:"commands"`
-	EBPFPrograms    []EBPFRequest `json:"ebpf_programs"`
-	NextSteps       []string      `json:"next_steps"`
-	Reasoning       string        `json:"reasoning"`
-	ConfidenceLevel float64       `json:"confidence_level"`
-}
-
 // ResolutionResponse represents the resolution phase response from AI
 type ResolutionResponse struct {
 	ResponseType   string `json:"response_type"`
@@ -93,6 +74,7 @@ type LinuxDiagnosticAgent struct {
 	episodeID   string           // TensorZero episode ID for conversation continuity
 	ebpfManager *BCCTraceManager // BCC-style eBPF tracing capabilities
 	config      *AgentConfig     // Configuration for concurrent execution
+	authManager interface{}      // Authentication manager for TensorZero requests
 }

 // NewLinuxDiagnosticAgent creates a new diagnostic agent
@@ -100,14 +82,14 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
 	// Get Supabase project URL for TensorZero proxy
 	supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
 	if supabaseURL == "" {
-		fmt.Printf("Warning: SUPABASE_PROJECT_URL not set, TensorZero integration will not work\n")
+		logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
 		supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
 	}

 	model := os.Getenv("NANNYAPI_MODEL")
 	if model == "" {
 		model = "tensorzero::function_name::diagnose_and_heal"
-		fmt.Printf("Warning: Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function.\n", model)
+		logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
 	}

 	// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
@@ -124,10 +106,40 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
 	return agent
 }

+// NewLinuxDiagnosticAgentWithAuth creates a diagnostic agent that keeps authManager so TensorZero proxy requests can be authenticated; the model name comes from NANNYAPI_MODEL or a built-in default.
+func NewLinuxDiagnosticAgentWithAuth(authManager interface{}) *LinuxDiagnosticAgent {
+	// Check the Supabase project URL used for the TensorZero proxy and warn when the environment variable is missing
+	supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
+	if supabaseURL == "" {
+		logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
+		supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback; NOTE(review): supabaseURL is not read again in this function, so this value appears unused here - confirm whether it should be stored on the agent
+	}
+
+	model := os.Getenv("NANNYAPI_MODEL")
+	if model == "" {
+		model = "tensorzero::function_name::diagnose_and_heal"
+		logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
+	}
+
+	// Note: the OpenAI client is no longer used; requests go over direct HTTP to the Supabase proxy
+	agent := &LinuxDiagnosticAgent{
+		client:      nil, // Not used anymore; left nil on purpose
+		model:       model,
+		executor:    NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
+		config:      DefaultAgentConfig(),                 // Default concurrent execution config
+		authManager: authManager,                          // Kept as an opaque value; sendRequestWithEpisode only uses it if it provides LoadToken() (*types.AuthToken, error)
+	}
+
+	// Initialize BCC-style eBPF capabilities (set after construction, not in the literal above)
+	agent.ebpfManager = NewBCCTraceManager()
+
+	return agent
+}
+
 // DiagnoseIssue starts the diagnostic process for a given issue
 func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
-	fmt.Printf("Diagnosing issue: %s\n", issue)
-	fmt.Println("Gathering system information...")
+	logging.Info("Diagnosing issue: %s", issue)
+	logging.Info("Gathering system information...")

 	// Gather system information
 	systemInfo := GatherSystemInfo()
@@ -155,25 +167,27 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 		}

 		content := response.Choices[0].Message.Content
-		fmt.Printf("\nAI Response:\n%s\n", content)
+		logging.Debug("AI Response: %s", content)

 		// Parse the response to determine next action
-		var diagnosticResp EBPFEnhancedDiagnosticResponse
+		var diagnosticResp types.EBPFEnhancedDiagnosticResponse
 		var resolutionResp ResolutionResponse

 		// Try to parse as diagnostic response first (with eBPF support)
+		logging.Debug("Attempting to parse response as diagnostic...")
 		if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
+			logging.Debug("Successfully parsed as diagnostic response with %d commands", len(diagnosticResp.Commands))
 			// Handle diagnostic phase
-			fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning)
+			logging.Debug("Reasoning: %s", diagnosticResp.Reasoning)

 			// Execute commands and collect results
 			commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
 			if len(diagnosticResp.Commands) > 0 {
-				fmt.Printf("🔧 Executing diagnostic commands...\n")
+				logging.Info("Executing %d diagnostic commands", len(diagnosticResp.Commands))
 				for i, cmdStr := range diagnosticResp.Commands {
-					// Convert string to Command struct
+					// Convert string command to Command struct (auto-generate ID and description)
 					cmd := Command{
-						ID:          fmt.Sprintf("cmd_%d", i),
+						ID:          fmt.Sprintf("cmd_%d", i+1),
 						Command:     cmdStr,
 						Description: fmt.Sprintf("Diagnostic command: %s", cmdStr),
 					}
@@ -181,7 +195,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 					commandResults = append(commandResults, result)

 					if result.ExitCode != 0 {
-						fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode)
+						logging.Warning("Command '%s' failed with exit code %d", cmd.ID, result.ExitCode)
 					}
 				}
 			}
@@ -189,7 +203,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 			// Execute eBPF programs if present - support both old and new formats
 			var ebpfResults []map[string]interface{}
 			if len(diagnosticResp.EBPFPrograms) > 0 {
-				fmt.Printf("🔬 AI requested %d eBPF traces for enhanced diagnostics\n", len(diagnosticResp.EBPFPrograms))
+				logging.Info("AI requested %d eBPF traces for enhanced diagnostics", len(diagnosticResp.EBPFPrograms))

 				// Convert EBPFPrograms to TraceSpecs and execute concurrently
 				traceSpecs := a.convertEBPFProgramsToTraceSpecs(diagnosticResp.EBPFPrograms)
@@ -210,12 +224,17 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 				// Extract evidence summary for TensorZero
 				evidenceSummary := make([]string, 0)
 				for _, result := range ebpfResults {
-					name := result["name"]
-					eventCount := result["data_points"]
-					description := result["description"]
-					status := result["status"]
+					target := result["target"]
+					eventCount := result["event_count"]
+					summary := result["summary"]
+					success := result["success"]

-					summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, eventCount, status, description)
+					status := "failed"
+					if success == true {
+						status = "success"
+					}
+
+					summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", target, eventCount, status, summary)
 					evidenceSummary = append(evidenceSummary, summaryStr)
 				}
 				allResults["ebpf_evidence_summary"] = evidenceSummary
@@ -237,20 +256,22 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 			})

 			continue
+		} else {
+			logging.Debug("Failed to parse as diagnostic. Error: %v, ResponseType: '%s'", err, diagnosticResp.ResponseType)
 		}

 		// Try to parse as resolution response
 		if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
 			// Handle resolution phase
-			fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n")
-			fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause)
-			fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan)
-			fmt.Printf("Confidence: %s\n", resolutionResp.Confidence)
+			logging.Info("=== DIAGNOSIS COMPLETE ===")
+			logging.Info("Root Cause: %s", resolutionResp.RootCause)
+			logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan)
+			logging.Info("Confidence: %s", resolutionResp.Confidence)
 			break
 		}

 		// If we can't parse the response, treat it as an error or unexpected format
-		fmt.Printf("Unexpected response format or error from AI:\n%s\n", content)
+		logging.Error("Unexpected response format or error from AI: %s", content)
 		break
 	}

@@ -296,8 +317,9 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
 		return nil, fmt.Errorf("SUPABASE_PROJECT_URL not set")
 	}

-	// Create HTTP request to TensorZero proxy
-	endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy", supabaseURL)
+	// Create HTTP request to TensorZero proxy (includes OpenAI-compatible path)
+	endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy/openai/v1/chat/completions", supabaseURL)
+	logging.Debug("Calling TensorZero proxy at: %s", endpoint)
 	req, err := http.NewRequest("POST", endpoint, bytes.NewBuffer(requestBody))
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
@@ -307,7 +329,17 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Accept", "application/json")

-	// Note: No authentication needed for TensorZero proxy based on the existing pattern
+	// Add authentication if auth manager is available (same pattern as investigation_server.go)
+	if a.authManager != nil {
+		// The authManager should be *auth.AuthManager, so let's use the exact same pattern
+		if authMgr, ok := a.authManager.(interface {
+			LoadToken() (*types.AuthToken, error)
+		}); ok {
+			if authToken, err := authMgr.LoadToken(); err == nil && authToken != nil {
+				req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken))
+			}
+		}
+	}

 	// Send request
 	client := &http.Client{Timeout: 30 * time.Second}
@@ -372,7 +404,7 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
 }

 // convertEBPFProgramsToTraceSpecs converts old EBPFProgram format to new TraceSpec format
-func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EBPFRequest) []TraceSpec {
+func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []types.EBPFRequest) []TraceSpec {
 	var traceSpecs []TraceSpec

 	for _, prog := range ebpfPrograms {
@@ -384,7 +416,7 @@ func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EB
 }

 // convertToTraceSpec converts an EBPFRequest to a TraceSpec for BCC-style tracing
-func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec {
+func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog types.EBPFRequest) TraceSpec {
 	// Determine probe type based on target and type
 	probeType := "p" // default to kprobe
 	target := prog.Target
@@ -421,6 +453,7 @@ func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec {
 		Format:    prog.Description, // Use description as format
 		Arguments: []string{},       // Start with no arguments for compatibility
 		Duration:  duration,
+		UID:       -1, // No UID filter (don't default to 0 which means root only)
 	}
 }

@@ -430,7 +463,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp
 		return []map[string]interface{}{}
 	}

-	fmt.Printf("🚀 Executing %d BCC traces with max %d concurrent tasks\n", len(traceSpecs), a.config.MaxConcurrentTasks)
+	logging.Info("Executing %d BCC traces with max %d concurrent tasks", len(traceSpecs), a.config.MaxConcurrentTasks)

 	// Channel to limit concurrent goroutines
 	semaphore := make(chan struct{}, a.config.MaxConcurrentTasks)
@@ -465,7 +498,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp
 	}

 	if a.config.CollectiveResults {
-		fmt.Printf("✅ All %d BCC traces completed. Sending collective results to API layer.\n", len(allResults))
+		logging.Debug("All %d BCC traces completed. Sending collective results to API layer", len(allResults))
 	}

 	return allResults
@@ -482,18 +515,18 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
 		"start_time": time.Now().Format(time.RFC3339),
 	}

-	fmt.Printf("🔍 [Task %d] Starting BCC trace: %s (type: %s)\n", index, spec.Target, spec.ProbeType)
+	logging.Debug("[Task %d] Starting BCC trace: %s (type: %s)", index, spec.Target, spec.ProbeType)

 	// Start the trace
 	traceID, err := a.ebpfManager.StartTrace(spec)
 	if err != nil {
 		result["error"] = fmt.Sprintf("Failed to start trace: %v", err)
-		fmt.Printf("❌ [Task %d] Failed to start trace %s: %v\n", index, spec.Target, err)
+		logging.Error("[Task %d] Failed to start trace %s: %v", index, spec.Target, err)
 		return result
 	}

 	result["trace_id"] = traceID
-	fmt.Printf("🚀 [Task %d] Trace %s started with ID: %s\n", index, spec.Target, traceID)
+	logging.Debug("[Task %d] Trace %s started with ID: %s", index, spec.Target, traceID)

 	// Wait for the trace duration
 	time.Sleep(time.Duration(spec.Duration) * time.Second)
@@ -504,7 +537,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
 		// Try to stop the trace if it's still running
 		a.ebpfManager.StopTrace(traceID)
 		result["error"] = fmt.Sprintf("Failed to get trace results: %v", err)
-		fmt.Printf("❌ [Task %d] Failed to get results for trace %s: %v\n", index, spec.Target, err)
+		logging.Error("[Task %d] Failed to get results for trace %s: %v", index, spec.Target, err)
 		return result
 	}

@@ -551,7 +584,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
 		result["top_processes"] = topProcesses
 	}

-	fmt.Printf("✅ [Task %d] Trace %s completed: %d events (%.2f events/sec)\n",
+	logging.Debug("[Task %d] Trace %s completed: %d events (%.2f events/sec)",
 		index, spec.Target, traceResult.EventCount, traceResult.Statistics.EventsPerSecond)

 	return result