Remove old eBPF implementations - keep only new BCC-style concurrent tracing

2025-11-08 14:56:56 +01:00
parent 8328f8d5b3
commit 190e54dd38
10 changed files with 326 additions and 1613 deletions
--- a/agent.go
+++ b/agent.go
@@ -2,12 +2,13 @@ package main

 import (
 	"bytes"
-	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
+	"strings"
+	"sync"
 	"time"

 	"github.com/sashabaranov/go-openai"
@@ -15,9 +16,35 @@ import (

 // DiagnosticResponse represents the diagnostic phase response from AI
 type DiagnosticResponse struct {
-	ResponseType string    `json:"response_type"`
-	Reasoning    string    `json:"reasoning"`
-	Commands     []Command `json:"commands"`
+	ResponseType    string   `json:"response_type"`
+	Phase           string   `json:"phase"`
+	Analysis        string   `json:"analysis"`
+	Commands        []string `json:"commands"` // plain command strings; no longer Command structs with ID/description
+	NextSteps       []string `json:"next_steps"`
+	Reasoning       string   `json:"reasoning"`
+	ConfidenceLevel float64  `json:"confidence_level"`
+}
+
+// EBPFRequest describes one eBPF trace requested in an AI response; convertToTraceSpec turns it into a TraceSpec
+type EBPFRequest struct {
+	Name        string            `json:"name"`
+	Type        string            `json:"type"`     // convertToTraceSpec special-cases "tracepoint" and "syscall"
+	Target      string            `json:"target"`   // may carry a "tracepoint:" or "kprobe:" prefix, which convertToTraceSpec strips
+	Duration    int               `json:"duration"` // seconds; convertToTraceSpec substitutes 5 when <= 0
+	Filters     map[string]string `json:"filters,omitempty"`
+	Description string            `json:"description"` // forwarded as the TraceSpec Format by convertToTraceSpec
+}
+
+// EBPFEnhancedDiagnosticResponse has the same fields as DiagnosticResponse plus the eBPF trace requests in EBPFPrograms
+type EBPFEnhancedDiagnosticResponse struct {
+	ResponseType    string        `json:"response_type"`
+	Phase           string        `json:"phase"`
+	Analysis        string        `json:"analysis"`
+	Commands        []string      `json:"commands"`
+	EBPFPrograms    []EBPFRequest `json:"ebpf_programs"` // trace requests; see convertToTraceSpec for how one maps to a TraceSpec
+	NextSteps       []string      `json:"next_steps"`
+	Reasoning       string        `json:"reasoning"`
+	ConfidenceLevel float64       `json:"confidence_level"`
 }

 // ResolutionResponse represents the resolution phase response from AI
@@ -35,6 +62,20 @@ type Command struct {
 	Description string `json:"description"`
 }

+// AgentConfig holds the settings that control concurrent trace execution
+type AgentConfig struct {
+	MaxConcurrentTasks int  `json:"max_concurrent_tasks"` // capacity of the semaphore in executeBCCTracesConcurrently
+	CollectiveResults  bool `json:"collective_results"`   // NOTE(review): appears to only gate a completion log line in executeBCCTracesConcurrently - confirm
+}
+
+// DefaultAgentConfig builds the configuration that NewLinuxDiagnosticAgent installs on a new agent
+func DefaultAgentConfig() *AgentConfig {
+	cfg := new(AgentConfig)
+	cfg.MaxConcurrentTasks = 10  // Default to 10 concurrent forks
+	cfg.CollectiveResults = true // Send results collectively when all finish
+	return cfg
+}
+
 // CommandResult represents the result of executing a command
 type CommandResult struct {
 	ID       string `json:"id"`
@@ -49,8 +90,9 @@ type LinuxDiagnosticAgent struct {
 	client      *openai.Client
 	model       string
 	executor    *CommandExecutor
-	episodeID   string               // TensorZero episode ID for conversation continuity
-	ebpfManager EBPFManagerInterface // eBPF monitoring capabilities
+	episodeID   string           // TensorZero episode ID for conversation continuity
+	ebpfManager *BCCTraceManager // BCC-style eBPF tracing capabilities
+	config      *AgentConfig     // Configuration for concurrent execution
 }

 // NewLinuxDiagnosticAgent creates a new diagnostic agent
@@ -73,10 +115,11 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
 		client:   nil, // Not used anymore
 		model:    model,
 		executor: NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
+		config:   DefaultAgentConfig(),                 // Default concurrent execution config
 	}

-	// Initialize eBPF capabilities
-	agent.ebpfManager = NewCiliumEBPFManager()
+	// Initialize BCC-style eBPF capabilities
+	agent.ebpfManager = NewBCCTraceManager()

 	return agent
 }
@@ -127,7 +170,13 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 			commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
 			if len(diagnosticResp.Commands) > 0 {
 				fmt.Printf("🔧 Executing diagnostic commands...\n")
-				for _, cmd := range diagnosticResp.Commands {
+				for i, cmdStr := range diagnosticResp.Commands {
+					// Convert string to Command struct
+					cmd := Command{
+						ID:          fmt.Sprintf("cmd_%d", i),
+						Command:     cmdStr,
+						Description: fmt.Sprintf("Diagnostic command: %s", cmdStr),
+					}
 					result := a.executor.Execute(cmd)
 					commandResults = append(commandResults, result)

@@ -137,10 +186,14 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 				}
 			}

-			// Execute eBPF programs if present
+			// Execute eBPF programs if present - support both old and new formats
 			var ebpfResults []map[string]interface{}
 			if len(diagnosticResp.EBPFPrograms) > 0 {
-				ebpfResults = a.executeEBPFPrograms(diagnosticResp.EBPFPrograms)
+				fmt.Printf("🔬 AI requested %d eBPF traces for enhanced diagnostics\n", len(diagnosticResp.EBPFPrograms))
+
+				// Convert EBPFPrograms to TraceSpecs and execute concurrently
+				traceSpecs := a.convertEBPFProgramsToTraceSpecs(diagnosticResp.EBPFPrograms)
+				ebpfResults = a.executeBCCTracesConcurrently(traceSpecs)
 			}

 			// Prepare combined results as user message
@@ -204,193 +257,59 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
 	return nil
 }

-// executeEBPFPrograms executes REAL eBPF monitoring programs using the actual eBPF manager
-func (a *LinuxDiagnosticAgent) executeEBPFPrograms(ebpfPrograms []EBPFRequest) []map[string]interface{} {
-	var results []map[string]interface{}
-
-	if a.ebpfManager == nil {
-		fmt.Printf("❌ eBPF manager not initialized\n")
-		return results
-	}
-
-	for _, prog := range ebpfPrograms {
-		// eBPF program starting - only show in debug mode
-
-		// Actually start the eBPF program using the real manager
-		programID, err := a.ebpfManager.StartEBPFProgram(prog)
-		if err != nil {
-			fmt.Printf("❌ Failed to start eBPF program [%s]: %v\n", prog.Name, err)
-			result := map[string]interface{}{
-				"name":        prog.Name,
-				"type":        prog.Type,
-				"target":      prog.Target,
-				"duration":    int(prog.Duration),
-				"description": prog.Description,
-				"status":      "failed",
-				"error":       err.Error(),
-				"success":     false,
-			}
-			results = append(results, result)
-			continue
-		}
-
-		// Let the eBPF program run for the specified duration
-		time.Sleep(time.Duration(prog.Duration) * time.Second)
-
-		// Give the collectEvents goroutine a moment to finish and store results
-		time.Sleep(500 * time.Millisecond)
-
-		// Use a channel to implement timeout for GetProgramResults
-		type resultPair struct {
-			trace *EBPFTrace
-			err   error
-		}
-		resultChan := make(chan resultPair, 1)
-
-		go func() {
-			trace, err := a.ebpfManager.GetProgramResults(programID)
-			resultChan <- resultPair{trace, err}
-		}()
-
-		var trace *EBPFTrace
-		var resultErr error
-
-		select {
-		case result := <-resultChan:
-			trace = result.trace
-			resultErr = result.err
-		case <-time.After(3 * time.Second):
-			resultErr = fmt.Errorf("timeout getting results after 3 seconds")
-		}
-
-		// Try to stop the program (may already be stopped by collectEvents)
-		stopErr := a.ebpfManager.StopProgram(programID)
-		if stopErr != nil {
-			// Only show warning in debug mode - this is normal for completed programs
-		}
-
-		if resultErr != nil {
-			fmt.Printf("❌ Failed to get results for eBPF program [%s]: %v\n", prog.Name, resultErr)
-			result := map[string]interface{}{
-				"name":        prog.Name,
-				"type":        prog.Type,
-				"target":      prog.Target,
-				"duration":    int(prog.Duration),
-				"description": prog.Description,
-				"status":      "collection_failed",
-				"error":       resultErr.Error(),
-				"success":     false,
-			}
-			results = append(results, result)
-			continue
-		} // Process the real eBPF trace data
-		result := map[string]interface{}{
-			"name":        prog.Name,
-			"type":        prog.Type,
-			"target":      prog.Target,
-			"duration":    int(prog.Duration),
-			"description": prog.Description,
-			"status":      "completed",
-			"success":     true,
-		}
-
-		// Extract real data from the trace
-		if trace != nil {
-			result["trace_id"] = trace.TraceID
-			result["data_points"] = trace.EventCount
-			result["events"] = trace.Events
-			result["summary"] = trace.Summary
-			result["process_list"] = trace.ProcessList
-			result["start_time"] = trace.StartTime.Format(time.RFC3339)
-			result["end_time"] = trace.EndTime.Format(time.RFC3339)
-			result["actual_duration"] = trace.EndTime.Sub(trace.StartTime).Seconds()
-
-		} else {
-			result["data_points"] = 0
-			result["error"] = "No trace data returned"
-			fmt.Printf("⚠️  eBPF program [%s] completed but returned no trace data\n", prog.Name)
-		}
-
-		results = append(results, result)
-	}
-
-	return results
-}
-
-// TensorZeroRequest represents a request structure compatible with TensorZero's episode_id
-type TensorZeroRequest struct {
-	Model     string                         `json:"model"`
-	Messages  []openai.ChatCompletionMessage `json:"messages"`
-	EpisodeID string                         `json:"tensorzero::episode_id,omitempty"`
-}
-
-// TensorZeroResponse represents TensorZero's response with episode_id
-type TensorZeroResponse struct {
-	openai.ChatCompletionResponse
-	EpisodeID string `json:"episode_id"`
-}
-
-// sendRequest sends a request to the TensorZero API via Supabase proxy with JWT authentication
+// sendRequest sends messages via sendRequestWithEpisode with an empty episode ID, so no tensorzero::episode_id field is added to the request
 func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
 	return a.sendRequestWithEpisode(messages, "")
 }

-// sendRequestWithEpisode sends a request with a specific episode ID
+// sendRequestWithEpisode sends a request to TensorZero via Supabase proxy with episode ID for conversation continuity
 func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-
-	// Create TensorZero-compatible request
-	tzRequest := TensorZeroRequest{
-		Model:    a.model,
-		Messages: messages,
+	// Convert messages to the expected format
+	messageMaps := make([]map[string]interface{}, len(messages))
+	for i, msg := range messages {
+		messageMaps[i] = map[string]interface{}{
+			"role":    msg.Role,
+			"content": msg.Content,
+		}
 	}

-	// Include tensorzero::episode_id for conversation continuity
-	// Use agent's existing episode ID if available, otherwise use provided one
-	if a.episodeID != "" {
-		tzRequest.EpisodeID = a.episodeID
-	} else if episodeID != "" {
-		tzRequest.EpisodeID = episodeID
+	// Create TensorZero request
+	tzRequest := map[string]interface{}{
+		"model":    a.model,
+		"messages": messageMaps,
 	}

-	fmt.Printf("Debug: Sending request to model: %s", a.model)
-	if a.episodeID != "" {
-		fmt.Printf(" (episode: %s)", a.episodeID)
+	// Add episode ID if provided
+	if episodeID != "" {
+		tzRequest["tensorzero::episode_id"] = episodeID
 	}
-	fmt.Println()

-	// Marshal the request
+	// Marshal request
 	requestBody, err := json.Marshal(tzRequest)
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal request: %w", err)
 	}

-	// Get Supabase project URL and build TensorZero proxy endpoint
+	// Get Supabase URL
 	supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
 	if supabaseURL == "" {
-		supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co"
+		return nil, fmt.Errorf("SUPABASE_PROJECT_URL not set")
 	}

-	// Build Supabase function URL with OpenAI v1 compatible path
-	endpoint := supabaseURL + "/functions/v1/tensorzero-proxy/openai/v1/chat/completions"
-
-	req, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewBuffer(requestBody))
+	// Create HTTP request to TensorZero proxy
+	endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy", supabaseURL)
+	req, err := http.NewRequest("POST", endpoint, bytes.NewBuffer(requestBody))
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}

+	// Set headers
 	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")

-	// Add JWT authentication header
-	accessToken, err := a.getAccessToken()
-	if err != nil {
-		return nil, fmt.Errorf("failed to get access token: %w", err)
-	}
+	// Note: No authentication needed for TensorZero proxy based on the existing pattern

-	req.Header.Set("Authorization", "Bearer "+accessToken)
-
-	// Make the request
+	// Send request
 	client := &http.Client{Timeout: 30 * time.Second}
 	resp, err := client.Do(req)
 	if err != nil {
@@ -398,55 +317,242 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
 	}
 	defer resp.Body.Close()

-	// Read response body
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read response: %w", err)
+	// Check status code
+	if resp.StatusCode != 200 {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("TensorZero proxy error: %d, body: %s", resp.StatusCode, string(body))
 	}

-	if resp.StatusCode != http.StatusOK {
-		return nil, fmt.Errorf("TensorZero API request failed with status %d: %s", resp.StatusCode, string(body))
+	// Parse response
+	var tzResponse map[string]interface{}
+	if err := json.NewDecoder(resp.Body).Decode(&tzResponse); err != nil {
+		return nil, fmt.Errorf("failed to decode response: %w", err)
 	}

-	// Parse TensorZero response
-	var tzResponse TensorZeroResponse
-	if err := json.Unmarshal(body, &tzResponse); err != nil {
-		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
+	// Convert to OpenAI format for compatibility
+	choices, ok := tzResponse["choices"].([]interface{})
+	if !ok || len(choices) == 0 {
+		return nil, fmt.Errorf("no choices in response")
 	}

-	// Extract episode_id from first response
-	if a.episodeID == "" && tzResponse.EpisodeID != "" {
-		a.episodeID = tzResponse.EpisodeID
-		fmt.Printf("Debug: Extracted episode ID: %s\n", a.episodeID)
+	// Extract the first choice
+	firstChoice, ok := choices[0].(map[string]interface{})
+	if !ok {
+		return nil, fmt.Errorf("invalid choice format")
 	}

-	return &tzResponse.ChatCompletionResponse, nil
+	message, ok := firstChoice["message"].(map[string]interface{})
+	if !ok {
+		return nil, fmt.Errorf("invalid message format")
+	}
+
+	content, ok := message["content"].(string)
+	if !ok {
+		return nil, fmt.Errorf("invalid content format")
+	}
+
+	// Create OpenAI-compatible response
+	response := &openai.ChatCompletionResponse{
+		Choices: []openai.ChatCompletionChoice{
+			{
+				Message: openai.ChatCompletionMessage{
+					Role:    openai.ChatMessageRoleAssistant,
+					Content: content,
+				},
+			},
+		},
+	}
+
+	// Update episode ID if provided in response
+	if respEpisodeID, ok := tzResponse["episode_id"].(string); ok && respEpisodeID != "" {
+		a.episodeID = respEpisodeID
+	}
+
+	return response, nil
 }

-// getAccessToken retrieves the current access token for authentication
-func (a *LinuxDiagnosticAgent) getAccessToken() (string, error) {
-	// Read token from the standard token file location
-	tokenPath := os.Getenv("TOKEN_PATH")
-	if tokenPath == "" {
-		tokenPath = "/var/lib/nannyagent/token.json"
+// convertEBPFProgramsToTraceSpecs maps each EBPFRequest to a TraceSpec via convertToTraceSpec, keeping input order (nil for empty input)
+func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EBPFRequest) []TraceSpec {
+	var traceSpecs []TraceSpec
+
+	for _, prog := range ebpfPrograms {
+		spec := a.convertToTraceSpec(prog)
+		traceSpecs = append(traceSpecs, spec)
 	}
 
-	tokenData, err := os.ReadFile(tokenPath)
-	if err != nil {
-		return "", fmt.Errorf("failed to read token file: %w", err)
-	}
-
-	var tokenInfo struct {
-		AccessToken string `json:"access_token"`
-	}
-
-	if err := json.Unmarshal(tokenData, &tokenInfo); err != nil {
-		return "", fmt.Errorf("failed to parse token file: %w", err)
-	}
-
-	if tokenInfo.AccessToken == "" {
-		return "", fmt.Errorf("access token is empty")
-	}
-
-	return tokenInfo.AccessToken, nil
+	return traceSpecs
+}
+
+// convertToTraceSpec maps a single EBPFRequest onto the TraceSpec used for BCC-style tracing
+func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec {
+	// An explicit "tracepoint:"/"kprobe:" prefix on the target takes priority over prog.Type
+	kind := "p" // kprobe unless a tracepoint is requested
+	name := prog.Target
+
+	switch {
+	case strings.HasPrefix(name, "tracepoint:"):
+		kind, name = "t", strings.TrimPrefix(name, "tracepoint:")
+	case strings.HasPrefix(name, "kprobe:"):
+		name = strings.TrimPrefix(name, "kprobe:")
+	case prog.Type == "tracepoint":
+		kind = "t"
+	case prog.Type == "syscall":
+		// Bare syscall names become kprobe targets by adding the "__x64_sys_" prefix
+		// (NOTE(review): this is x86-64 symbol naming - confirm for other architectures)
+		switch {
+		case strings.HasPrefix(name, "__x64_sys_"), strings.Contains(name, ":"):
+			// already prefixed, or some other "kind:name" form: pass through untouched
+		case strings.HasPrefix(name, "sys_"):
+			name = "__x64_" + name
+		default:
+			name = "__x64_sys_" + name
+		}
+	}
+
+	// Non-positive durations fall back to 5 seconds
+	seconds := 5
+	if prog.Duration > 0 {
+		seconds = prog.Duration
+	}
+
+	return TraceSpec{
+		ProbeType: kind,
+		Target:    name,
+		Format:    prog.Description, // the description doubles as the format
+		Arguments: []string{},       // no arguments, for compatibility
+		Duration:  seconds,
+	}
+}
+
+// executeBCCTracesConcurrently runs every trace spec in its own goroutine and returns one result
+// map per spec, in the same order as traceSpecs. At most MaxConcurrentTasks traces run at once;
+// a nil config or non-positive limit falls back to the DefaultAgentConfig value, because a
+// zero-capacity semaphore would block every worker forever and a negative size panics in make.
+func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSpec) []map[string]interface{} {
+	if len(traceSpecs) == 0 {
+		return []map[string]interface{}{}
+	}
+
+	cfg := a.config
+	if cfg == nil {
+		cfg = DefaultAgentConfig()
+	}
+	limit := cfg.MaxConcurrentTasks
+	if limit <= 0 {
+		limit = DefaultAgentConfig().MaxConcurrentTasks
+	}
+
+	fmt.Printf("🚀 Executing %d BCC traces with max %d concurrent tasks\n", len(traceSpecs), limit)
+
+	// Buffered channel used as a counting semaphore to cap the number of running traces
+	semaphore := make(chan struct{}, limit)
+	// Each goroutine writes only its own slot, so the slice needs no extra locking
+	allResults := make([]map[string]interface{}, len(traceSpecs))
+	var wg sync.WaitGroup
+
+	for i, spec := range traceSpecs {
+		wg.Add(1)
+		go func(index int, traceSpec TraceSpec) {
+			defer wg.Done()
+			semaphore <- struct{}{} // acquire a slot
+			defer func() { <-semaphore }()
+
+			allResults[index] = a.executeSingleBCCTrace(index, traceSpec)
+		}(i, spec)
+	}
+
+	// Block until every trace has stored its result
+	wg.Wait()
+
+	if cfg.CollectiveResults {
+		fmt.Printf("✅ All %d BCC traces completed. Sending collective results to API layer.\n", len(allResults))
+	}
+
+	return allResults
+}
+
+// executeSingleBCCTrace runs one trace to completion: it starts the trace, sleeps for
+// spec.Duration seconds, fetches the result and flattens it into a result map.
+// Failures are reported through the "success" and "error" keys rather than a Go error.
+func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec) map[string]interface{} {
+	// Pessimistic defaults: "success" only becomes true once results were fetched
+	out := map[string]interface{}{
+		"index":      index,
+		"target":     spec.Target,
+		"probe_type": spec.ProbeType,
+		"success":    false,
+		"error":      "",
+		"start_time": time.Now().Format(time.RFC3339),
+	}
+
+	fmt.Printf("🔍 [Task %d] Starting BCC trace: %s (type: %s)\n", index, spec.Target, spec.ProbeType)
+
+	id, startErr := a.ebpfManager.StartTrace(spec)
+	if startErr != nil {
+		out["error"] = fmt.Sprintf("Failed to start trace: %v", startErr)
+		fmt.Printf("❌ [Task %d] Failed to start trace %s: %v\n", index, spec.Target, startErr)
+		return out
+	}
+
+	out["trace_id"] = id
+	fmt.Printf("🚀 [Task %d] Trace %s started with ID: %s\n", index, spec.Target, id)
+
+	// Let the trace collect events for the requested number of seconds
+	time.Sleep(time.Duration(spec.Duration) * time.Second)
+
+	tr, fetchErr := a.ebpfManager.GetTraceResult(id)
+	if fetchErr != nil {
+		// Best effort: the trace may still be running, so ask the manager to stop it
+		a.ebpfManager.StopTrace(id)
+		out["error"] = fmt.Sprintf("Failed to get trace results: %v", fetchErr)
+		fmt.Printf("❌ [Task %d] Failed to get results for trace %s: %v\n", index, spec.Target, fetchErr)
+		return out
+	}
+
+	// Headline numbers copied straight from the trace result
+	out["success"] = true
+	out["end_time"] = time.Now().Format(time.RFC3339)
+	out["event_count"] = tr.EventCount
+	out["events_per_second"] = tr.Statistics.EventsPerSecond
+	out["duration"] = tr.EndTime.Sub(tr.StartTime).Seconds()
+	out["summary"] = tr.Summary
+
+	// Attach at most sampleLimit events so the payload stays small
+	const sampleLimit = 10
+	if total := len(tr.Events); total > 0 {
+		keep := total
+		if keep > sampleLimit {
+			keep = sampleLimit
+		}
+
+		samples := make([]map[string]interface{}, 0, keep)
+		for _, ev := range tr.Events[:keep] {
+			samples = append(samples, map[string]interface{}{
+				"pid":          ev.PID,
+				"tid":          ev.TID,
+				"process_name": ev.ProcessName,
+				"message":      ev.Message,
+				"timestamp":    ev.Timestamp,
+			})
+		}
+		out["sample_events"] = samples
+	}
+
+	// Attach the per-process breakdown when the statistics provide one
+	if procs := tr.Statistics.TopProcesses; len(procs) > 0 {
+		breakdown := make([]map[string]interface{}, 0, len(procs))
+		for _, p := range procs {
+			breakdown = append(breakdown, map[string]interface{}{
+				"process_name": p.ProcessName,
+				"event_count":  p.EventCount,
+				"percentage":   p.Percentage,
+			})
+		}
+		out["top_processes"] = breakdown
+	}
+
+	fmt.Printf("✅ [Task %d] Trace %s completed: %d events (%.2f events/sec)\n",
+		index, spec.Target, tr.EventCount, tr.Statistics.EventsPerSecond)
+
+	return out
 }