501 lines
16 KiB
Go
501 lines
16 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"nannyagentv2/internal/ebpf"
|
|
"nannyagentv2/internal/executor"
|
|
"nannyagentv2/internal/logging"
|
|
"nannyagentv2/internal/system"
|
|
"nannyagentv2/internal/types"
|
|
|
|
"github.com/sashabaranov/go-openai"
|
|
)
|
|
|
|
// AgentConfig holds configuration for concurrent execution (local to agent).
type AgentConfig struct {
	// MaxConcurrentTasks caps how many diagnostic forks may run at once.
	MaxConcurrentTasks int `json:"max_concurrent_tasks"`
	// CollectiveResults, when true, sends results collectively once all tasks finish.
	CollectiveResults bool `json:"collective_results"`
}

// DefaultAgentConfig returns the default configuration: at most 10
// concurrent forks, with results reported collectively on completion.
func DefaultAgentConfig() *AgentConfig {
	cfg := &AgentConfig{}
	cfg.MaxConcurrentTasks = 10 // default to 10 concurrent forks
	cfg.CollectiveResults = true // send results collectively when all finish
	return cfg
}
|
|
|
|
// LinuxDiagnosticAgent represents the main diagnostic agent. It drives an
// AI-assisted diagnose/resolve loop: system information and command output
// are sent to a TensorZero function (via a Supabase edge-function proxy) and
// the model's JSON replies decide which shell commands and eBPF traces to
// run next (see DiagnoseIssue).
type LinuxDiagnosticAgent struct {
	client      *openai.Client            // always nil — requests go over direct HTTP to the Supabase proxy instead
	model       string                    // TensorZero function/model name (from NANNYAPI_MODEL)
	executor    *executor.CommandExecutor // runs diagnostic shell commands with a per-command timeout
	episodeID   string                    // TensorZero episode ID for conversation continuity
	ebpfManager *ebpf.BCCTraceManager     // eBPF tracing manager
	config      *AgentConfig              // Configuration for concurrent execution
	authManager interface{}               // Authentication manager for TensorZero requests (probed for LoadToken at call time)
	logger      *logging.Logger           // structured logger used by the eBPF trace path
}
|
|
|
|
// NewLinuxDiagnosticAgent creates a new diagnostic agent
|
|
func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
|
|
// Get Supabase project URL for TensorZero proxy
|
|
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
|
if supabaseURL == "" {
|
|
logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
|
|
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
|
|
}
|
|
|
|
model := os.Getenv("NANNYAPI_MODEL")
|
|
if model == "" {
|
|
model = "tensorzero::function_name::diagnose_and_heal"
|
|
logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
|
|
}
|
|
|
|
// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
|
|
agent := &LinuxDiagnosticAgent{
|
|
client: nil, // Not used anymore
|
|
model: model,
|
|
executor: executor.NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
|
|
config: DefaultAgentConfig(), // Default concurrent execution config
|
|
}
|
|
|
|
// Initialize eBPF manager
|
|
agent.ebpfManager = ebpf.NewBCCTraceManager()
|
|
agent.logger = logging.NewLogger()
|
|
|
|
return agent
|
|
}
|
|
|
|
// NewLinuxDiagnosticAgentWithAuth creates a new diagnostic agent with authentication
|
|
func NewLinuxDiagnosticAgentWithAuth(authManager interface{}) *LinuxDiagnosticAgent {
|
|
// Get Supabase project URL for TensorZero proxy
|
|
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
|
if supabaseURL == "" {
|
|
logging.Warning("SUPABASE_PROJECT_URL not set, TensorZero integration will not work")
|
|
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
|
|
}
|
|
|
|
model := os.Getenv("NANNYAPI_MODEL")
|
|
if model == "" {
|
|
model = "tensorzero::function_name::diagnose_and_heal"
|
|
logging.Warning("Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function", model)
|
|
}
|
|
|
|
// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
|
|
agent := &LinuxDiagnosticAgent{
|
|
client: nil, // Not used anymore
|
|
model: model,
|
|
executor: executor.NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
|
|
config: DefaultAgentConfig(), // Default concurrent execution config
|
|
authManager: authManager, // Store auth manager for TensorZero requests
|
|
}
|
|
|
|
// Initialize eBPF manager
|
|
agent.ebpfManager = ebpf.NewBCCTraceManager()
|
|
agent.logger = logging.NewLogger()
|
|
|
|
return agent
|
|
}
|
|
|
|
// DiagnoseIssue starts the diagnostic process for a given issue.
//
// It gathers system information, prepends it to the issue text, and sends
// the result to the TensorZero backend. It then loops over the model's JSON
// replies: a "diagnostic" reply has its commands (and any eBPF trace
// requests) executed and the results appended to the conversation; a
// "resolution" reply is logged and ends the loop; anything unparseable is
// logged as an error and also ends the loop. Always returns nil unless a
// request or result-marshalling step fails.
//
// NOTE(review): the loop has no iteration cap — a model that keeps returning
// "diagnostic" responses keeps this running indefinitely; confirm intended.
func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
	logging.Info("Diagnosing issue: %s", issue)
	logging.Info("Gathering system information...")

	// Gather system information
	systemInfo := system.GatherSystemInfo()

	// Format the initial prompt with system information
	initialPrompt := system.FormatSystemInfoForPrompt(systemInfo) + "\n" + issue

	// Start conversation with initial issue including system info
	messages := []openai.ChatCompletionMessage{
		{
			Role:    openai.ChatMessageRoleUser,
			Content: initialPrompt,
		},
	}

	for {
		// Send request to TensorZero; a.episodeID threads the conversation
		// (it is updated by SendRequestWithEpisode from each response).
		response, err := a.SendRequestWithEpisode(messages, a.episodeID)
		if err != nil {
			return fmt.Errorf("failed to send request: %w", err)
		}

		if len(response.Choices) == 0 {
			return fmt.Errorf("no choices in response")
		}

		content := response.Choices[0].Message.Content
		logging.Debug("AI Response: %s", content)

		// Parse the response to determine next action
		var diagnosticResp types.EBPFEnhancedDiagnosticResponse
		var resolutionResp types.ResolutionResponse

		// Try to parse as diagnostic response first (with eBPF support).
		// Both the unmarshal succeeding AND response_type == "diagnostic"
		// are required; otherwise we fall through to the resolution check.
		logging.Debug("Attempting to parse response as diagnostic...")
		if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
			logging.Debug("Successfully parsed as diagnostic response with %d commands", len(diagnosticResp.Commands))
			// Handle diagnostic phase
			logging.Debug("Reasoning: %s", diagnosticResp.Reasoning)

			// Execute commands and collect results
			commandResults := make([]types.CommandResult, 0, len(diagnosticResp.Commands))
			if len(diagnosticResp.Commands) > 0 {
				logging.Info("Executing %d diagnostic commands", len(diagnosticResp.Commands))
				for i, cmdStr := range diagnosticResp.Commands {
					// Convert string command to Command struct (auto-generate ID and description)
					cmd := types.Command{
						ID:          fmt.Sprintf("cmd_%d", i+1),
						Command:     cmdStr,
						Description: fmt.Sprintf("Diagnostic command: %s", cmdStr),
					}
					result := a.executor.Execute(cmd)
					commandResults = append(commandResults, result)

					// Non-zero exit is logged but does not stop the batch —
					// the model sees the failure in the results payload.
					if result.ExitCode != 0 {
						logging.Warning("Command '%s' failed with exit code %d", cmd.ID, result.ExitCode)
					}
				}
			}

			// Execute eBPF programs if present - support both old and new formats
			var ebpfResults []map[string]interface{}
			if len(diagnosticResp.EBPFPrograms) > 0 {
				logging.Info("AI requested %d eBPF traces for enhanced diagnostics", len(diagnosticResp.EBPFPrograms))

				// Convert EBPFPrograms to TraceSpecs and execute using the eBPF service
				traceSpecs := a.ConvertEBPFProgramsToTraceSpecs(diagnosticResp.EBPFPrograms)
				ebpfResults = a.ExecuteEBPFTraces(traceSpecs)
			}

			// Prepare combined results as user message
			allResults := map[string]interface{}{
				"command_results":   commandResults,
				"executed_commands": len(commandResults),
			}

			// Include eBPF results if any were executed
			if len(ebpfResults) > 0 {
				allResults["ebpf_results"] = ebpfResults
				allResults["executed_ebpf_programs"] = len(ebpfResults)

				// Extract evidence summary for TensorZero — one human-readable
				// line per trace, built from the result maps produced by
				// ExecuteEBPFTraces (keys: target/event_count/summary/success).
				evidenceSummary := make([]string, 0)
				for _, result := range ebpfResults {
					target := result["target"]
					eventCount := result["event_count"]
					summary := result["summary"]
					success := result["success"]

					status := "failed"
					if success == true {
						status = "success"
					}

					summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", target, eventCount, status, summary)
					evidenceSummary = append(evidenceSummary, summaryStr)
				}
				allResults["ebpf_evidence_summary"] = evidenceSummary
			}

			resultsJSON, err := json.MarshalIndent(allResults, "", " ")
			if err != nil {
				return fmt.Errorf("failed to marshal command results: %w", err)
			}

			// Add AI response and command results to conversation so the
			// model sees its own previous turn plus the fresh evidence.
			messages = append(messages, openai.ChatCompletionMessage{
				Role:    openai.ChatMessageRoleAssistant,
				Content: content,
			})
			messages = append(messages, openai.ChatCompletionMessage{
				Role:    openai.ChatMessageRoleUser,
				Content: string(resultsJSON),
			})

			continue
		} else {
			logging.Debug("Failed to parse as diagnostic. Error: %v, ResponseType: '%s'", err, diagnosticResp.ResponseType)
		}

		// Try to parse as resolution response
		if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
			// Handle resolution phase: log the findings and stop iterating.
			logging.Info("=== DIAGNOSIS COMPLETE ===")
			logging.Info("Root Cause: %s", resolutionResp.RootCause)
			logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan)
			logging.Info("Confidence: %s", resolutionResp.Confidence)
			break
		}

		// If we can't parse the response, treat it as an error or unexpected
		// format; log and bail out rather than looping on garbage.
		logging.Error("Unexpected response format or error from AI: %s", content)
		break
	}

	return nil
}
|
|
|
|
// SendRequest sends a request to TensorZero via the Supabase proxy without an
// episode ID (it simply delegates to SendRequestWithEpisode with "").
func (a *LinuxDiagnosticAgent) SendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
	return a.SendRequestWithEpisode(messages, "")
}
|
|
|
|
// ExecuteCommand executes a single command using the agent's executor and
// returns the resulting types.CommandResult (which callers inspect for
// ExitCode — see DiagnoseIssue).
func (a *LinuxDiagnosticAgent) ExecuteCommand(cmd types.Command) types.CommandResult {
	return a.executor.Execute(cmd)
}
|
|
|
|
// sendRequestWithEpisode sends a request to TensorZero via Supabase proxy with episode ID for conversation continuity
|
|
func (a *LinuxDiagnosticAgent) SendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
|
|
// Convert messages to the expected format
|
|
messageMaps := make([]map[string]interface{}, len(messages))
|
|
for i, msg := range messages {
|
|
messageMaps[i] = map[string]interface{}{
|
|
"role": msg.Role,
|
|
"content": msg.Content,
|
|
}
|
|
}
|
|
|
|
// Create TensorZero request
|
|
tzRequest := map[string]interface{}{
|
|
"model": a.model,
|
|
"messages": messageMaps,
|
|
}
|
|
|
|
// Add episode ID if provided
|
|
if episodeID != "" {
|
|
tzRequest["tensorzero::episode_id"] = episodeID
|
|
}
|
|
|
|
// Marshal request
|
|
requestBody, err := json.Marshal(tzRequest)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Get Supabase URL
|
|
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
|
if supabaseURL == "" {
|
|
return nil, fmt.Errorf("SUPABASE_PROJECT_URL not set")
|
|
}
|
|
|
|
// Create HTTP request to TensorZero proxy (includes OpenAI-compatible path)
|
|
endpoint := fmt.Sprintf("%s/functions/v1/tensorzero-proxy/openai/v1/chat/completions", supabaseURL)
|
|
logging.Debug("Calling TensorZero proxy at: %s", endpoint)
|
|
req, err := http.NewRequest("POST", endpoint, bytes.NewBuffer(requestBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
// Set headers
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Accept", "application/json")
|
|
|
|
// Add authentication if auth manager is available (same pattern as investigation_server.go)
|
|
if a.authManager != nil {
|
|
// The authManager should be *auth.AuthManager, so let's use the exact same pattern
|
|
if authMgr, ok := a.authManager.(interface {
|
|
LoadToken() (*types.AuthToken, error)
|
|
}); ok {
|
|
if authToken, err := authMgr.LoadToken(); err == nil && authToken != nil {
|
|
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Send request
|
|
client := &http.Client{Timeout: 30 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to send request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Check status code
|
|
if resp.StatusCode != 200 {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("TensorZero proxy error: %d, body: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
// Parse response
|
|
var tzResponse map[string]interface{}
|
|
if err := json.NewDecoder(resp.Body).Decode(&tzResponse); err != nil {
|
|
return nil, fmt.Errorf("failed to decode response: %w", err)
|
|
}
|
|
|
|
// Convert to OpenAI format for compatibility
|
|
choices, ok := tzResponse["choices"].([]interface{})
|
|
if !ok || len(choices) == 0 {
|
|
return nil, fmt.Errorf("no choices in response")
|
|
}
|
|
|
|
// Extract the first choice
|
|
firstChoice, ok := choices[0].(map[string]interface{})
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid choice format")
|
|
}
|
|
|
|
message, ok := firstChoice["message"].(map[string]interface{})
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid message format")
|
|
}
|
|
|
|
content, ok := message["content"].(string)
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid content format")
|
|
}
|
|
|
|
// Create OpenAI-compatible response
|
|
response := &openai.ChatCompletionResponse{
|
|
Choices: []openai.ChatCompletionChoice{
|
|
{
|
|
Message: openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleAssistant,
|
|
Content: content,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
// Update episode ID if provided in response
|
|
if respEpisodeID, ok := tzResponse["episode_id"].(string); ok && respEpisodeID != "" {
|
|
a.episodeID = respEpisodeID
|
|
}
|
|
|
|
return response, nil
|
|
}
|
|
|
|
// ConvertEBPFProgramsToTraceSpecs converts old EBPFProgram format to new TraceSpec format
|
|
func (a *LinuxDiagnosticAgent) ConvertEBPFProgramsToTraceSpecs(ebpfPrograms []types.EBPFRequest) []ebpf.TraceSpec {
|
|
var traceSpecs []ebpf.TraceSpec
|
|
|
|
for _, prog := range ebpfPrograms {
|
|
spec := a.convertToTraceSpec(prog)
|
|
traceSpecs = append(traceSpecs, spec)
|
|
}
|
|
|
|
return traceSpecs
|
|
}
|
|
|
|
// convertToTraceSpec converts an EBPFRequest to a TraceSpec for BCC-style tracing
|
|
func (a *LinuxDiagnosticAgent) convertToTraceSpec(prog types.EBPFRequest) ebpf.TraceSpec {
|
|
// Determine probe type based on target and type
|
|
probeType := "p" // default to kprobe
|
|
target := prog.Target
|
|
|
|
if strings.HasPrefix(target, "tracepoint:") {
|
|
probeType = "t"
|
|
target = strings.TrimPrefix(target, "tracepoint:")
|
|
} else if strings.HasPrefix(target, "kprobe:") {
|
|
probeType = "p"
|
|
target = strings.TrimPrefix(target, "kprobe:")
|
|
} else if prog.Type == "tracepoint" {
|
|
probeType = "t"
|
|
} else if prog.Type == "syscall" {
|
|
// Convert syscall names to kprobe targets
|
|
if !strings.HasPrefix(target, "__x64_sys_") && !strings.Contains(target, ":") {
|
|
if strings.HasPrefix(target, "sys_") {
|
|
target = "__x64_" + target
|
|
} else {
|
|
target = "__x64_sys_" + target
|
|
}
|
|
}
|
|
probeType = "p"
|
|
}
|
|
|
|
// Set default duration if not specified
|
|
duration := prog.Duration
|
|
if duration <= 0 {
|
|
duration = 5 // default 5 seconds
|
|
}
|
|
|
|
return ebpf.TraceSpec{
|
|
ProbeType: probeType,
|
|
Target: target,
|
|
Format: prog.Description, // Use description as format
|
|
Arguments: []string{}, // Start with no arguments for compatibility
|
|
Duration: duration,
|
|
UID: -1, // No UID filter (don't default to 0 which means root only)
|
|
}
|
|
}
|
|
|
|
// ExecuteEBPFTraces executes multiple eBPF traces using the agent's
// BCCTraceManager. Each spec is started, the call blocks (time.Sleep) for
// the spec's duration, then the result is fetched. Failures to start or to
// fetch results become per-trace error entries instead of aborting the
// batch, so the returned slice contains one map per input spec.
//
// NOTE(review): traces run strictly one after another here — total wall time
// is the sum of all durations; confirm whether concurrent execution was
// intended by the "execute concurrently" wording at the call site.
func (a *LinuxDiagnosticAgent) ExecuteEBPFTraces(traceSpecs []ebpf.TraceSpec) []map[string]interface{} {
	if len(traceSpecs) == 0 {
		return []map[string]interface{}{}
	}

	a.logger.Info("Executing %d eBPF traces", len(traceSpecs))

	results := make([]map[string]interface{}, 0, len(traceSpecs))

	// Execute each trace using the eBPF manager
	for i, spec := range traceSpecs {
		a.logger.Debug("Starting trace %d: %s", i, spec.Target)

		// Start the trace; on failure record an error entry and move on.
		traceID, err := a.ebpfManager.StartTrace(spec)
		if err != nil {
			a.logger.Error("Failed to start trace %d: %v", i, err)
			result := map[string]interface{}{
				"index":   i,
				"target":  spec.Target,
				"success": false,
				"error":   err.Error(),
			}
			results = append(results, result)
			continue
		}

		// Wait for the trace duration (blocking; traces do not overlap).
		time.Sleep(time.Duration(spec.Duration) * time.Second)

		// Get the trace result; on failure record an error entry and move on.
		traceResult, err := a.ebpfManager.GetTraceResult(traceID)
		if err != nil {
			a.logger.Error("Failed to get results for trace %d: %v", i, err)
			result := map[string]interface{}{
				"index":   i,
				"target":  spec.Target,
				"success": false,
				"error":   err.Error(),
			}
			results = append(results, result)
			continue
		}

		// Build successful result (keys consumed by DiagnoseIssue's
		// evidence-summary step: target/event_count/summary/success).
		result := map[string]interface{}{
			"index":             i,
			"target":            spec.Target,
			"success":           true,
			"event_count":       traceResult.EventCount,
			"events_per_second": traceResult.Statistics.EventsPerSecond,
			"duration":          traceResult.EndTime.Sub(traceResult.StartTime).Seconds(),
			"summary":           traceResult.Summary,
		}
		results = append(results, result)

		a.logger.Debug("Completed trace %d: %d events", i, traceResult.EventCount)
	}

	a.logger.Info("Completed %d eBPF traces", len(results))
	return results
}
|