somewhat working ebpf bpftrace
This commit is contained in:
145
agent.go
145
agent.go
@@ -11,6 +11,9 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"nannyagentv2/internal/logging"
|
||||
"nannyagentv2/internal/types"
|
||||
|
||||
"github.com/sashabaranov/go-openai"
|
||||
)
|
||||
|
||||
@@ -25,28 +28,6 @@ type DiagnosticResponse struct {
|
||||
ConfidenceLevel float64 `json:"confidence_level"`
|
||||
}
|
||||
|
||||
// EBPFRequest represents a request for eBPF program execution.
//
// Values of this type are decoded from the "ebpf_programs" array of an
// EBPFEnhancedDiagnosticResponse and later converted into trace specs by the
// agent.
type EBPFRequest struct {
	// Name is the caller-chosen label of the program ("name").
	Name string `json:"name"`
	// Type is the requested program/probe kind ("type"). Together with
	// Target it is consulted when the agent picks a probe type.
	Type string `json:"type"`
	// Target is what the program should attach to ("target").
	Target string `json:"target"`
	// Duration is how long the program should run ("duration").
	// NOTE(review): presumably seconds; confirm against the converter.
	Duration int `json:"duration"`
	// Filters holds optional key/value filters; the key is omitted from the
	// encoded JSON when the map is empty.
	Filters map[string]string `json:"filters,omitempty"`
	// Description is free-form text ("description"); the trace-spec
	// conversion reuses it as the output format string.
	Description string `json:"description"`
}
|
||||
|
||||
// EBPFEnhancedDiagnosticResponse represents the enhanced diagnostic response with eBPF
|
||||
type EBPFEnhancedDiagnosticResponse struct {
|
||||
ResponseType string `json:"response_type"`
|
||||
Phase string `json:"phase"`
|
||||
Analysis string `json:"analysis"`
|
||||
Commands []string `json:"commands"`
|
||||
EBPFPrograms []EBPFRequest `json:"ebpf_programs"`
|
||||
NextSteps []string `json:"next_steps"`
|
||||
Reasoning string `json:"reasoning"`
|
||||
ConfidenceLevel float64 `json:"confidence_level"`
|
||||
}
|
||||
|
||||
// ResolutionResponse represents the resolution phase response from AI
|
||||
type ResolutionResponse struct {
|
||||
ResponseType string `json:"response_type"`
|
||||
@@ -93,6 +74,7 @@ type LinuxDiagnosticAgent struct {
|
||||
episodeID string // TensorZero episode ID for conversation continuity
|
||||
ebpfManager *BCCTraceManager // BCC-style eBPF tracing capabilities
|
||||
config *AgentConfig // Configuration for concurrent execution
|
||||
authManager interface{} // Authentication manager for TensorZero requests
|
||||
}
|
||||
|
||||
// NewLinuxDiagnosticAgent creates a new diagnostic agent
|
||||
@@ -100,14 +82,14 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
|
||||
// Get Supabase project URL for TensorZero proxy
|
||||
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
||||
if supabaseURL == "" {
|
||||
fmt.Printf("Warning: SUPABASE_PROJECT_URL not set, TensorZero integration will not work\n")
|
||||
logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
|
||||
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
|
||||
}
|
||||
|
||||
model := os.Getenv("NANNYAPI_MODEL")
|
||||
if model == "" {
|
||||
model = "tensorzero::function_name::diagnose_and_heal"
|
||||
fmt.Printf("Warning: Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function.\n", model)
|
||||
logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
|
||||
}
|
||||
|
||||
// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
|
||||
@@ -124,10 +106,40 @@ func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
|
||||
return agent
|
||||
}
|
||||
|
||||
// NewLinuxDiagnosticAgentWithAuth creates a new diagnostic agent with authentication
|
||||
func NewLinuxDiagnosticAgentWithAuth(authManager interface{}) *LinuxDiagnosticAgent {
|
||||
// Get Supabase project URL for TensorZero proxy
|
||||
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
||||
if supabaseURL == "" {
|
||||
logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
|
||||
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
|
||||
}
|
||||
|
||||
model := os.Getenv("NANNYAPI_MODEL")
|
||||
if model == "" {
|
||||
model = "tensorzero::function_name::diagnose_and_heal"
|
||||
logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
|
||||
}
|
||||
|
||||
// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
|
||||
agent := &LinuxDiagnosticAgent{
|
||||
client: nil, // Not used anymore
|
||||
model: model,
|
||||
executor: NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
|
||||
config: DefaultAgentConfig(), // Default concurrent execution config
|
||||
authManager: authManager, // Store auth manager for TensorZero requests
|
||||
}
|
||||
|
||||
// Initialize BCC-style eBPF capabilities
|
||||
agent.ebpfManager = NewBCCTraceManager()
|
||||
|
||||
return agent
|
||||
}
|
||||
|
||||
// DiagnoseIssue starts the diagnostic process for a given issue
|
||||
func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
fmt.Printf("Diagnosing issue: %s\n", issue)
|
||||
fmt.Println("Gathering system information...")
|
||||
logging.Info("Diagnosing issue: %s", issue)
|
||||
logging.Info("Gathering system information...")
|
||||
|
||||
// Gather system information
|
||||
systemInfo := GatherSystemInfo()
|
||||
@@ -155,25 +167,27 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
}
|
||||
|
||||
content := response.Choices[0].Message.Content
|
||||
fmt.Printf("\nAI Response:\n%s\n", content)
|
||||
logging.Debug("AI Response: %s", content)
|
||||
|
||||
// Parse the response to determine next action
|
||||
var diagnosticResp EBPFEnhancedDiagnosticResponse
|
||||
var diagnosticResp types.EBPFEnhancedDiagnosticResponse
|
||||
var resolutionResp ResolutionResponse
|
||||
|
||||
// Try to parse as diagnostic response first (with eBPF support)
|
||||
logging.Debug("Attempting to parse response as diagnostic...")
|
||||
if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
|
||||
logging.Debug("Successfully parsed as diagnostic response with %d commands", len(diagnosticResp.Commands))
|
||||
// Handle diagnostic phase
|
||||
fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning)
|
||||
logging.Debug("Reasoning: %s", diagnosticResp.Reasoning)
|
||||
|
||||
// Execute commands and collect results
|
||||
commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
|
||||
if len(diagnosticResp.Commands) > 0 {
|
||||
fmt.Printf("🔧 Executing diagnostic commands...\n")
|
||||
logging.Info("Executing %d diagnostic commands", len(diagnosticResp.Commands))
|
||||
for i, cmdStr := range diagnosticResp.Commands {
|
||||
// Convert string to Command struct
|
||||
// Convert string command to Command struct (auto-generate ID and description)
|
||||
cmd := Command{
|
||||
ID: fmt.Sprintf("cmd_%d", i),
|
||||
ID: fmt.Sprintf("cmd_%d", i+1),
|
||||
Command: cmdStr,
|
||||
Description: fmt.Sprintf("Diagnostic command: %s", cmdStr),
|
||||
}
|
||||
@@ -181,7 +195,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
commandResults = append(commandResults, result)
|
||||
|
||||
if result.ExitCode != 0 {
|
||||
fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode)
|
||||
logging.Warning("Command '%s' failed with exit code %d", cmd.ID, result.ExitCode)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -189,7 +203,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
// Execute eBPF programs if present - support both old and new formats
|
||||
var ebpfResults []map[string]interface{}
|
||||
if len(diagnosticResp.EBPFPrograms) > 0 {
|
||||
fmt.Printf("🔬 AI requested %d eBPF traces for enhanced diagnostics\n", len(diagnosticResp.EBPFPrograms))
|
||||
logging.Info("AI requested %d eBPF traces for enhanced diagnostics", len(diagnosticResp.EBPFPrograms))
|
||||
|
||||
// Convert EBPFPrograms to TraceSpecs and execute concurrently
|
||||
traceSpecs := a.convertEBPFProgramsToTraceSpecs(diagnosticResp.EBPFPrograms)
|
||||
@@ -210,12 +224,17 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
// Extract evidence summary for TensorZero
|
||||
evidenceSummary := make([]string, 0)
|
||||
for _, result := range ebpfResults {
|
||||
name := result["name"]
|
||||
eventCount := result["data_points"]
|
||||
description := result["description"]
|
||||
status := result["status"]
|
||||
target := result["target"]
|
||||
eventCount := result["event_count"]
|
||||
summary := result["summary"]
|
||||
success := result["success"]
|
||||
|
||||
summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, eventCount, status, description)
|
||||
status := "failed"
|
||||
if success == true {
|
||||
status = "success"
|
||||
}
|
||||
|
||||
summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", target, eventCount, status, summary)
|
||||
evidenceSummary = append(evidenceSummary, summaryStr)
|
||||
}
|
||||
allResults["ebpf_evidence_summary"] = evidenceSummary
|
||||
@@ -237,20 +256,22 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||||
})
|
||||
|
||||
continue
|
||||
} else {
|
||||
logging.Debug("Failed to parse as diagnostic. Error: %v, ResponseType: '%s'", err, diagnosticResp.ResponseType)
|
||||
}
|
||||
|
||||
// Try to parse as resolution response
|
||||
if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
|
||||
// Handle resolution phase
|
||||
fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n")
|
||||
fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause)
|
||||
fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan)
|
||||
fmt.Printf("Confidence: %s\n", resolutionResp.Confidence)
|
||||
logging.Info("=== DIAGNOSIS COMPLETE ===")
|
||||
logging.Info("Root Cause: %s", resolutionResp.RootCause)
|
||||
logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan)
|
||||
logging.Info("Confidence: %s", resolutionResp.Confidence)
|
||||
break
|
||||
}
|
||||
|
||||
// If we can't parse the response, treat it as an error or unexpected format
|
||||
fmt.Printf("Unexpected response format or error from AI:\n%s\n", content)
|
||||
logging.Error("Unexpected response format or error from AI: %s", content)
|
||||
break
|
||||
}
|
||||
|
||||
@@ -296,8 +317,9 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
|
||||
return nil, fmt.Errorf("SUPABASE_PROJECT_URL not set")
|
||||
}
|
||||
|
||||
// Create HTTP request to TensorZero proxy
|
||||
endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy", supabaseURL)
|
||||
// Create HTTP request to TensorZero proxy (includes OpenAI-compatible path)
|
||||
endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy/openai/v1/chat/completions", supabaseURL)
|
||||
logging.Debug("Calling TensorZero proxy at: %s", endpoint)
|
||||
req, err := http.NewRequest("POST", endpoint, bytes.NewBuffer(requestBody))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
@@ -307,7 +329,17 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
// Note: No authentication needed for TensorZero proxy based on the existing pattern
|
||||
// Add authentication if auth manager is available (same pattern as investigation_server.go)
|
||||
if a.authManager != nil {
|
||||
// The authManager should be *auth.AuthManager, so let's use the exact same pattern
|
||||
if authMgr, ok := a.authManager.(interface {
|
||||
LoadToken() (*types.AuthToken, error)
|
||||
}); ok {
|
||||
if authToken, err := authMgr.LoadToken(); err == nil && authToken != nil {
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send request
|
||||
client := &http.Client{Timeout: 30 * time.Second}
|
||||
@@ -372,7 +404,7 @@ func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatComp
|
||||
}
|
||||
|
||||
// convertEBPFProgramsToTraceSpecs converts old EBPFProgram format to new TraceSpec format
|
||||
func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EBPFRequest) []TraceSpec {
|
||||
func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []types.EBPFRequest) []TraceSpec {
|
||||
var traceSpecs []TraceSpec
|
||||
|
||||
for _, prog := range ebpfPrograms {
|
||||
@@ -384,7 +416,7 @@ func (a *LinuxDiagnosticAgent) convertEBPFProgramsToTraceSpecs(ebpfPrograms []EB
|
||||
}
|
||||
|
||||
// convertToTraceSpec converts an EBPFRequest to a TraceSpec for BCC-style tracing
|
||||
func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec {
|
||||
func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog types.EBPFRequest) TraceSpec {
|
||||
// Determine probe type based on target and type
|
||||
probeType := "p" // default to kprobe
|
||||
target := prog.Target
|
||||
@@ -421,6 +453,7 @@ func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog EBPFRequest) TraceSpec {
|
||||
Format: prog.Description, // Use description as format
|
||||
Arguments: []string{}, // Start with no arguments for compatibility
|
||||
Duration: duration,
|
||||
UID: -1, // No UID filter (don't default to 0 which means root only)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,7 +463,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp
|
||||
return []map[string]interface{}{}
|
||||
}
|
||||
|
||||
fmt.Printf("🚀 Executing %d BCC traces with max %d concurrent tasks\n", len(traceSpecs), a.config.MaxConcurrentTasks)
|
||||
logging.Info("Executing %d BCC traces with max %d concurrent tasks", len(traceSpecs), a.config.MaxConcurrentTasks)
|
||||
|
||||
// Channel to limit concurrent goroutines
|
||||
semaphore := make(chan struct{}, a.config.MaxConcurrentTasks)
|
||||
@@ -465,7 +498,7 @@ func (a *LinuxDiagnosticAgent) executeBCCTracesConcurrently(traceSpecs []TraceSp
|
||||
}
|
||||
|
||||
if a.config.CollectiveResults {
|
||||
fmt.Printf("✅ All %d BCC traces completed. Sending collective results to API layer.\n", len(allResults))
|
||||
logging.Debug("All %d BCC traces completed. Sending collective results to API layer", len(allResults))
|
||||
}
|
||||
|
||||
return allResults
|
||||
@@ -482,18 +515,18 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
|
||||
"start_time": time.Now().Format(time.RFC3339),
|
||||
}
|
||||
|
||||
fmt.Printf("🔍 [Task %d] Starting BCC trace: %s (type: %s)\n", index, spec.Target, spec.ProbeType)
|
||||
logging.Debug("[Task %d] Starting BCC trace: %s (type: %s)", index, spec.Target, spec.ProbeType)
|
||||
|
||||
// Start the trace
|
||||
traceID, err := a.ebpfManager.StartTrace(spec)
|
||||
if err != nil {
|
||||
result["error"] = fmt.Sprintf("Failed to start trace: %v", err)
|
||||
fmt.Printf("❌ [Task %d] Failed to start trace %s: %v\n", index, spec.Target, err)
|
||||
logging.Error("[Task %d] Failed to start trace %s: %v", index, spec.Target, err)
|
||||
return result
|
||||
}
|
||||
|
||||
result["trace_id"] = traceID
|
||||
fmt.Printf("🚀 [Task %d] Trace %s started with ID: %s\n", index, spec.Target, traceID)
|
||||
logging.Debug("[Task %d] Trace %s started with ID: %s", index, spec.Target, traceID)
|
||||
|
||||
// Wait for the trace duration
|
||||
time.Sleep(time.Duration(spec.Duration) * time.Second)
|
||||
@@ -504,7 +537,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
|
||||
// Try to stop the trace if it's still running
|
||||
a.ebpfManager.StopTrace(traceID)
|
||||
result["error"] = fmt.Sprintf("Failed to get trace results: %v", err)
|
||||
fmt.Printf("❌ [Task %d] Failed to get results for trace %s: %v\n", index, spec.Target, err)
|
||||
logging.Error("[Task %d] Failed to get results for trace %s: %v", index, spec.Target, err)
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -551,7 +584,7 @@ func (a *LinuxDiagnosticAgent) executeSingleBCCTrace(index int, spec TraceSpec)
|
||||
result["top_processes"] = topProcesses
|
||||
}
|
||||
|
||||
fmt.Printf("✅ [Task %d] Trace %s completed: %d events (%.2f events/sec)\n",
|
||||
logging.Debug("[Task %d] Trace %s completed: %d events (%.2f events/sec)",
|
||||
index, spec.Target, traceResult.EventCount, traceResult.Statistics.EventsPerSecond)
|
||||
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user