// nannyagent/agent.go

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"time"

	"github.com/sashabaranov/go-openai"
)

// DiagnosticResponse represents the diagnostic phase response from AI.
//
// JSON fields: "response_type" (ResponseType), "reasoning" (Reasoning) and
// "commands" (Commands, a list of Command values).
//
// NOTE(review): DiagnoseIssue in this file decodes replies into
// EBPFEnhancedDiagnosticResponse rather than this type — confirm whether this
// struct is still used elsewhere.
type DiagnosticResponse struct {
	ResponseType string    `json:"response_type"`
	Reasoning    string    `json:"reasoning"`
	Commands     []Command `json:"commands"`
}

// ResolutionResponse represents the resolution phase response from AI.
//
// DiagnoseIssue treats a reply as a resolution when it decodes into this type
// and ResponseType equals "resolution"; it then prints RootCause,
// ResolutionPlan and Confidence and ends the diagnostic loop. Confidence is a
// string; its allowed values are not defined in this file.
type ResolutionResponse struct {
	ResponseType   string `json:"response_type"`
	RootCause      string `json:"root_cause"`
	ResolutionPlan string `json:"resolution_plan"`
	Confidence     string `json:"confidence"`
}

// Command represents a command to be executed.
//
// ID identifies the command (JSON "id"), Command holds the command text (JSON
// "command") and Description is an accompanying description (JSON
// "description").
//
// NOTE(review): presumably Command is the command line run by the executor —
// confirm against CommandExecutor.
type Command struct {
	ID          string `json:"id"`
	Command     string `json:"command"`
	Description string `json:"description"`
}

// CommandResult represents the result of executing a command.
//
// DiagnoseIssue collects one value per executed command, prints a failure
// notice when ExitCode is non-zero, and sends the collected results back to
// the model under the "command_results" key. Error is omitted from the JSON
// encoding when it is empty.
type CommandResult struct {
	ID       string `json:"id"`
	Command  string `json:"command"`
	Output   string `json:"output"`
	ExitCode int    `json:"exit_code"`
	Error    string `json:"error,omitempty"`
}

// LinuxDiagnosticAgent represents the main agent.
//
// It holds the model name sent with every request, the executor used to run
// diagnostic commands, the eBPF manager used to run requested eBPF programs,
// and the episode ID captured from the first response that carries one.
// NewLinuxDiagnosticAgent leaves client nil: requests are sent over direct
// HTTP rather than through the OpenAI client.
type LinuxDiagnosticAgent struct {
	client      *openai.Client
	model       string
	executor    *CommandExecutor
	episodeID   string               // TensorZero episode ID for conversation continuity
	ebpfManager EBPFManagerInterface // eBPF monitoring capabilities
}

// NewLinuxDiagnosticAgent creates a new diagnostic agent.
//
// Configuration is read from the environment:
//   - SUPABASE_PROJECT_URL is only checked here so that a warning can be
//     printed when it is missing; the URL itself is resolved again for each
//     request in sendRequestWithEpisode, which applies its own fallback.
//   - NANNYAPI_MODEL is the model/function name sent with every request; when
//     unset it defaults to "tensorzero::function_name::diagnose_and_heal" and
//     a warning is printed.
//
// The returned agent has a command executor created with a 10 second timeout
// and an eBPF manager created by NewCiliumEBPFManager. The OpenAI client field
// is left nil because requests go over direct HTTP to the Supabase proxy.
func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
	// Warn only: the value is not stored on the agent, so there is nothing to
	// assign a fallback to here.
	if os.Getenv("SUPABASE_PROJECT_URL") == "" {
		fmt.Printf("Warning: SUPABASE_PROJECT_URL not set, TensorZero integration will not work\n")
	}

	model := os.Getenv("NANNYAPI_MODEL")
	if model == "" {
		model = "tensorzero::function_name::diagnose_and_heal"
		fmt.Printf("Warning: Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function.\n", model)
	}

	// client stays at its zero value (nil); it is no longer used.
	agent := &LinuxDiagnosticAgent{
		model:    model,
		executor: NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
	}

	// Initialize eBPF capabilities.
	agent.ebpfManager = NewCiliumEBPFManager()

	return agent
}

// DiagnoseIssue runs the diagnostic conversation for the given issue.
//
// The first message sent to the model is the formatted system information
// followed by the issue text. Every reply is then handled according to its
// response_type:
//   - "diagnostic": the requested commands (and eBPF programs, if any) are
//     executed, their results are appended to the conversation as a JSON user
//     message, and another request is sent.
//   - "resolution": root cause, resolution plan and confidence are printed
//     and the loop ends.
//   - anything else: the raw reply is printed and the loop ends.
//
// An error is returned only when a request fails, a reply has no choices, or
// the results cannot be marshalled; an unrecognised reply still returns nil.
func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
	fmt.Printf("Diagnosing issue: %s\n", issue)
	fmt.Println("Gathering system information...")

	// Seed the conversation with system information followed by the issue.
	prompt := FormatSystemInfoForPrompt(GatherSystemInfo()) + "\n" + issue
	conversation := []openai.ChatCompletionMessage{
		{Role: openai.ChatMessageRoleUser, Content: prompt},
	}

	for {
		reply, err := a.sendRequestWithEpisode(conversation, a.episodeID)
		if err != nil {
			return fmt.Errorf("failed to send request: %w", err)
		}
		if len(reply.Choices) == 0 {
			return fmt.Errorf("no choices in response")
		}

		raw := reply.Choices[0].Message.Content
		fmt.Printf("\nAI Response:\n%s\n", raw)

		// Anything that is not a well-formed diagnostic reply ends the loop:
		// either as a resolution or as an unexpected payload.
		var diag EBPFEnhancedDiagnosticResponse
		if json.Unmarshal([]byte(raw), &diag) != nil || diag.ResponseType != "diagnostic" {
			var resolution ResolutionResponse
			if json.Unmarshal([]byte(raw), &resolution) == nil && resolution.ResponseType == "resolution" {
				fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n")
				fmt.Printf("Root Cause: %s\n", resolution.RootCause)
				fmt.Printf("Resolution Plan: %s\n", resolution.ResolutionPlan)
				fmt.Printf("Confidence: %s\n", resolution.Confidence)
				return nil
			}

			fmt.Printf("Unexpected response format or error from AI:\n%s\n", raw)
			return nil
		}

		// Diagnostic phase.
		fmt.Printf("\nReasoning: %s\n", diag.Reasoning)

		// Allocated non-nil so that an empty run still encodes as [] in JSON.
		outcomes := make([]CommandResult, 0, len(diag.Commands))
		if len(diag.Commands) > 0 {
			fmt.Printf("🔧 Executing diagnostic commands...\n")
		}
		for _, cmd := range diag.Commands {
			outcome := a.executor.Execute(cmd)
			outcomes = append(outcomes, outcome)
			if outcome.ExitCode != 0 {
				fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, outcome.ExitCode)
			}
		}

		// eBPF programs are optional; only touch the manager when requested.
		var traces []map[string]interface{}
		if len(diag.EBPFPrograms) > 0 {
			traces = a.executeEBPFPrograms(diag.EBPFPrograms)
		}

		payload := map[string]interface{}{
			"command_results":   outcomes,
			"executed_commands": len(outcomes),
		}
		if len(traces) > 0 {
			payload["ebpf_results"] = traces
			payload["executed_ebpf_programs"] = len(traces)

			// One human-readable line per eBPF program for the model.
			summary := make([]string, 0, len(traces))
			for _, tr := range traces {
				line := fmt.Sprintf("%s: %v events (%s) - %s", tr["name"], tr["data_points"], tr["status"], tr["description"])
				summary = append(summary, line)
			}
			payload["ebpf_evidence_summary"] = summary
		}

		encoded, err := json.MarshalIndent(payload, "", "  ")
		if err != nil {
			return fmt.Errorf("failed to marshal command results: %w", err)
		}

		// Record the model's reply and our results, then ask again.
		conversation = append(conversation,
			openai.ChatCompletionMessage{Role: openai.ChatMessageRoleAssistant, Content: raw},
			openai.ChatCompletionMessage{Role: openai.ChatMessageRoleUser, Content: string(encoded)},
		)
	}
}

// executeEBPFPrograms runs each requested eBPF program through the agent's
// eBPF manager and returns one result map per program, in request order.
//
// For every program it starts the program, sleeps for prog.Duration seconds
// plus 500ms, then waits up to 3 seconds for GetProgramResults before asking
// the manager to stop the program. Every result map carries "name", "type",
// "target", "duration", "description", "status" and "success":
//   - status "failed": the program could not be started ("error" is set).
//   - status "collection_failed": results could not be fetched, or fetching
//     timed out ("error" is set).
//   - status "completed": trace fields are copied in when a trace was
//     returned; otherwise "data_points" is 0 and "error" explains why.
//
// A nil slice is returned when the manager is nil or no programs were given.
func (a *LinuxDiagnosticAgent) executeEBPFPrograms(ebpfPrograms []EBPFRequest) []map[string]interface{} {
	var results []map[string]interface{}

	if a.ebpfManager == nil {
		fmt.Printf("❌ eBPF manager not initialized\n")
		return results
	}

	// fetchOutcome carries the result of the asynchronous GetProgramResults call.
	type fetchOutcome struct {
		trace *EBPFTrace
		err   error
	}

	for _, prog := range ebpfPrograms {
		// describe builds the fields shared by every result entry for prog.
		describe := func(status string, success bool) map[string]interface{} {
			return map[string]interface{}{
				"name":        prog.Name,
				"type":        prog.Type,
				"target":      prog.Target,
				"duration":    int(prog.Duration),
				"description": prog.Description,
				"status":      status,
				"success":     success,
			}
		}

		programID, err := a.ebpfManager.StartEBPFProgram(prog)
		if err != nil {
			fmt.Printf("❌ Failed to start eBPF program [%s]: %v\n", prog.Name, err)
			entry := describe("failed", false)
			entry["error"] = err.Error()
			results = append(results, entry)
			continue
		}

		// Let the eBPF program run for the specified duration.
		time.Sleep(time.Duration(prog.Duration) * time.Second)

		// Give the collectEvents goroutine a moment to finish and store results.
		time.Sleep(500 * time.Millisecond)

		// Fetch results in a goroutine so the wait can be bounded. The channel
		// is buffered so the goroutine can still deliver and exit after a
		// timeout, once GetProgramResults eventually returns.
		fetched := make(chan fetchOutcome, 1)
		go func() {
			trace, err := a.ebpfManager.GetProgramResults(programID)
			fetched <- fetchOutcome{trace, err}
		}()

		var trace *EBPFTrace
		var resultErr error
		select {
		case got := <-fetched:
			trace, resultErr = got.trace, got.err
		case <-time.After(3 * time.Second):
			resultErr = fmt.Errorf("timeout getting results after 3 seconds")
		}

		// Best-effort stop: the program may already have been stopped by
		// collectEvents, in which case an error here is expected and harmless.
		_ = a.ebpfManager.StopProgram(programID)

		if resultErr != nil {
			fmt.Printf("❌ Failed to get results for eBPF program [%s]: %v\n", prog.Name, resultErr)
			entry := describe("collection_failed", false)
			entry["error"] = resultErr.Error()
			results = append(results, entry)
			continue
		}

		entry := describe("completed", true)
		if trace == nil {
			entry["data_points"] = 0
			entry["error"] = "No trace data returned"
			fmt.Printf("⚠️  eBPF program [%s] completed but returned no trace data\n", prog.Name)
			results = append(results, entry)
			continue
		}

		// Copy the real data out of the trace.
		entry["trace_id"] = trace.TraceID
		entry["data_points"] = trace.EventCount
		entry["events"] = trace.Events
		entry["summary"] = trace.Summary
		entry["process_list"] = trace.ProcessList
		entry["start_time"] = trace.StartTime.Format(time.RFC3339)
		entry["end_time"] = trace.EndTime.Format(time.RFC3339)
		entry["actual_duration"] = trace.EndTime.Sub(trace.StartTime).Seconds()
		results = append(results, entry)
	}

	return results
}

// TensorZeroRequest is the JSON body posted by sendRequestWithEpisode: the
// model name, the conversation messages and the episode ID. The episode ID is
// encoded under the "tensorzero::episode_id" key and is omitted from the JSON
// when empty.
type TensorZeroRequest struct {
	Model     string                         `json:"model"`
	Messages  []openai.ChatCompletionMessage `json:"messages"`
	EpisodeID string                         `json:"tensorzero::episode_id,omitempty"`
}

// TensorZeroResponse is the decoded reply body: the embedded OpenAI chat
// completion response plus the top-level "episode_id" field.
// sendRequestWithEpisode stores EpisodeID on the agent the first time a
// non-empty value is received.
type TensorZeroResponse struct {
	openai.ChatCompletionResponse
	EpisodeID string `json:"episode_id"`
}

// sendRequest posts the conversation to the TensorZero API through the
// Supabase proxy (JWT-authenticated) without supplying an explicit episode ID.
// It delegates to sendRequestWithEpisode with an empty episode argument.
func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
	const noEpisode = "" // no caller-supplied episode
	resp, err := a.sendRequestWithEpisode(messages, noEpisode)
	return resp, err
}

// sendRequestWithEpisode posts the conversation to the TensorZero proxy and
// returns the decoded chat completion.
//
// Episode selection: the agent's stored episode ID takes precedence; the
// episodeID argument is used only when the agent has none yet. The first
// non-empty episode ID found in a response is stored on the agent.
//
// The endpoint is built from SUPABASE_PROJECT_URL (with a built-in fallback),
// the request is authenticated with the bearer token from getAccessToken, and
// the whole exchange is bounded by a 30 second timeout.
//
// An error is returned when marshalling, request creation, token lookup, the
// HTTP round trip or body read fails, when the status is not 200 OK (the
// response body is included in the message), or when the body is not valid
// JSON for TensorZeroResponse.
func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Resolve the episode once so the request and the debug line below always
	// agree: the agent's existing episode wins, otherwise the provided one.
	effectiveEpisode := a.episodeID
	if effectiveEpisode == "" {
		effectiveEpisode = episodeID
	}

	// Create TensorZero-compatible request; an empty episode is omitted from
	// the JSON by the field's omitempty tag.
	tzRequest := TensorZeroRequest{
		Model:     a.model,
		Messages:  messages,
		EpisodeID: effectiveEpisode,
	}

	fmt.Printf("Debug: Sending request to model: %s", a.model)
	if effectiveEpisode != "" {
		fmt.Printf(" (episode: %s)", effectiveEpisode)
	}
	fmt.Println()

	// Marshal the request
	requestBody, err := json.Marshal(tzRequest)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	// Get Supabase project URL and build TensorZero proxy endpoint
	supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
	if supabaseURL == "" {
		supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co"
	}

	// Build Supabase function URL with OpenAI v1 compatible path
	endpoint := supabaseURL + "/functions/v1/tensorzero-proxy/openai/v1/chat/completions"

	req, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewBuffer(requestBody))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("Content-Type", "application/json")

	// Add JWT authentication header
	accessToken, err := a.getAccessToken()
	if err != nil {
		return nil, fmt.Errorf("failed to get access token: %w", err)
	}

	req.Header.Set("Authorization", "Bearer "+accessToken)

	// Make the request
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	// Read the body before checking the status so that a failure response can
	// be reported together with whatever the server said.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("TensorZero API request failed with status %d: %s", resp.StatusCode, string(body))
	}

	// Parse TensorZero response
	var tzResponse TensorZeroResponse
	if err := json.Unmarshal(body, &tzResponse); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
	}

	// Remember the episode from the first response that carries one.
	if a.episodeID == "" && tzResponse.EpisodeID != "" {
		a.episodeID = tzResponse.EpisodeID
		fmt.Printf("Debug: Extracted episode ID: %s\n", a.episodeID)
	}

	return &tzResponse.ChatCompletionResponse, nil
}

// getAccessToken loads the bearer token used to authenticate requests.
//
// The token file path comes from the TOKEN_PATH environment variable and
// defaults to /var/lib/nannyagent/token.json. The file must be JSON with a
// non-empty "access_token" string. An error is returned when the file cannot
// be read, cannot be parsed, or the token is empty.
func (a *LinuxDiagnosticAgent) getAccessToken() (string, error) {
	const defaultTokenPath = "/var/lib/nannyagent/token.json"

	path := defaultTokenPath
	if override := os.Getenv("TOKEN_PATH"); override != "" {
		path = override
	}

	raw, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("failed to read token file: %w", err)
	}

	// Only the access_token field of the file is needed.
	var payload struct {
		AccessToken string `json:"access_token"`
	}
	if err = json.Unmarshal(raw, &payload); err != nil {
		return "", fmt.Errorf("failed to parse token file: %w", err)
	}

	switch token := payload.AccessToken; token {
	case "":
		return "", fmt.Errorf("access token is empty")
	default:
		return token, nil
	}
}