474 lines
15 KiB
Go
474 lines
15 KiB
Go
package main
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"encoding/json"
|
||
"fmt"
|
||
"io"
|
||
"net/http"
|
||
"os"
|
||
"time"
|
||
|
||
"github.com/sashabaranov/go-openai"
|
||
)
|
||
|
||
// DiagnosticResponse represents the diagnostic phase response from AI
|
||
type DiagnosticResponse struct {
|
||
ResponseType string `json:"response_type"`
|
||
Reasoning string `json:"reasoning"`
|
||
Commands []Command `json:"commands"`
|
||
}
|
||
|
||
// ResolutionResponse is the JSON payload the AI returns once it has reached a
// conclusion. DiagnoseIssue accepts it when ResponseType equals "resolution"
// and prints the remaining fields.
type ResolutionResponse struct {
	// ResponseType is the discriminator carried in the "response_type" field.
	ResponseType string `json:"response_type"`

	// RootCause is the AI's statement of what caused the issue.
	RootCause string `json:"root_cause"`

	// ResolutionPlan is the AI's proposed remediation.
	ResolutionPlan string `json:"resolution_plan"`

	// Confidence is the AI's self-reported confidence, kept as an opaque string.
	Confidence string `json:"confidence"`
}
|
||
|
||
// Command describes a single command requested by the AI.
type Command struct {
	// ID identifies the command; it is echoed in progress output.
	ID string `json:"id"`

	// Command is the command line text handed to the executor.
	Command string `json:"command"`

	// Description is the AI-supplied explanation of the command.
	Description string `json:"description"`
}
|
||
|
||
// CommandResult records the outcome of running one Command. It is serialised
// to JSON and sent back to the AI as part of the next user message.
type CommandResult struct {
	// ID is the identifier of the command this result belongs to.
	ID string `json:"id"`

	// Command is the command line text associated with this result.
	Command string `json:"command"`

	// Output is the captured output of the command.
	Output string `json:"output"`

	// ExitCode is the command's exit status; DiagnoseIssue treats 0 as success.
	ExitCode int `json:"exit_code"`

	// Error holds an error description and is omitted from JSON when empty.
	Error string `json:"error,omitempty"`
}
|
||
|
||
// LinuxDiagnosticAgent represents the main agent
|
||
type LinuxDiagnosticAgent struct {
|
||
client *openai.Client
|
||
model string
|
||
executor *CommandExecutor
|
||
episodeID string // TensorZero episode ID for conversation continuity
|
||
ebpfManager EBPFManagerInterface // eBPF monitoring capabilities
|
||
}
|
||
|
||
// NewLinuxDiagnosticAgent creates a new diagnostic agent
|
||
func NewLinuxDiagnosticAgent() *LinuxDiagnosticAgent {
|
||
// Get Supabase project URL for TensorZero proxy
|
||
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
||
if supabaseURL == "" {
|
||
fmt.Printf("Warning: SUPABASE_PROJECT_URL not set, TensorZero integration will not work\n")
|
||
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co" // fallback
|
||
}
|
||
|
||
model := os.Getenv("NANNYAPI_MODEL")
|
||
if model == "" {
|
||
model = "tensorzero::function_name::diagnose_and_heal"
|
||
fmt.Printf("Warning: Using default model '%s'. Set NANNYAPI_MODEL environment variable for your specific function.\n", model)
|
||
}
|
||
|
||
// Note: We don't use the OpenAI client anymore, we use direct HTTP to Supabase proxy
|
||
agent := &LinuxDiagnosticAgent{
|
||
client: nil, // Not used anymore
|
||
model: model,
|
||
executor: NewCommandExecutor(10 * time.Second), // 10 second timeout for commands
|
||
}
|
||
|
||
// Initialize eBPF capabilities
|
||
agent.ebpfManager = NewCiliumEBPFManager()
|
||
|
||
return agent
|
||
}
|
||
|
||
// DiagnoseIssue starts the diagnostic process for a given issue
|
||
func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
|
||
fmt.Printf("Diagnosing issue: %s\n", issue)
|
||
fmt.Println("Gathering system information...")
|
||
|
||
// Gather system information
|
||
systemInfo := GatherSystemInfo()
|
||
|
||
// Format the initial prompt with system information
|
||
initialPrompt := FormatSystemInfoForPrompt(systemInfo) + "\n" + issue
|
||
|
||
// Start conversation with initial issue including system info
|
||
messages := []openai.ChatCompletionMessage{
|
||
{
|
||
Role: openai.ChatMessageRoleUser,
|
||
Content: initialPrompt,
|
||
},
|
||
}
|
||
|
||
for {
|
||
// Send request to TensorZero API via OpenAI SDK
|
||
response, err := a.sendRequestWithEpisode(messages, a.episodeID)
|
||
if err != nil {
|
||
return fmt.Errorf("failed to send request: %w", err)
|
||
}
|
||
|
||
if len(response.Choices) == 0 {
|
||
return fmt.Errorf("no choices in response")
|
||
}
|
||
|
||
content := response.Choices[0].Message.Content
|
||
fmt.Printf("\nAI Response:\n%s\n", content)
|
||
|
||
// Parse the response to determine next action
|
||
var diagnosticResp EBPFEnhancedDiagnosticResponse
|
||
var resolutionResp ResolutionResponse
|
||
|
||
// Try to parse as diagnostic response first (with eBPF support)
|
||
if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
|
||
// Handle diagnostic phase
|
||
fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning)
|
||
|
||
// Execute commands and collect results
|
||
commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
|
||
if len(diagnosticResp.Commands) > 0 {
|
||
fmt.Printf("🔧 Executing diagnostic commands...\n")
|
||
for _, cmd := range diagnosticResp.Commands {
|
||
fmt.Printf("⚙️ Executing command '%s': %s\n", cmd.ID, cmd.Command)
|
||
result := a.executor.Execute(cmd)
|
||
commandResults = append(commandResults, result)
|
||
|
||
if result.ExitCode == 0 {
|
||
fmt.Printf("✅ Command '%s' completed successfully\n", cmd.ID)
|
||
} else {
|
||
fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Execute eBPF programs if present
|
||
var ebpfResults []map[string]interface{}
|
||
if len(diagnosticResp.EBPFPrograms) > 0 {
|
||
fmt.Printf("🔬 Executing %d eBPF programs...\n", len(diagnosticResp.EBPFPrograms))
|
||
ebpfResults = a.executeEBPFPrograms(diagnosticResp.EBPFPrograms)
|
||
}
|
||
|
||
// Prepare combined results as user message
|
||
allResults := map[string]interface{}{
|
||
"command_results": commandResults,
|
||
"executed_commands": len(commandResults),
|
||
}
|
||
|
||
// Include eBPF results if any were executed
|
||
if len(ebpfResults) > 0 {
|
||
allResults["ebpf_results"] = ebpfResults
|
||
allResults["executed_ebpf_programs"] = len(ebpfResults)
|
||
|
||
// Extract evidence summary for TensorZero
|
||
evidenceSummary := make([]string, 0)
|
||
for _, result := range ebpfResults {
|
||
name := result["name"]
|
||
eventCount := result["data_points"]
|
||
description := result["description"]
|
||
status := result["status"]
|
||
|
||
summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, eventCount, status, description)
|
||
evidenceSummary = append(evidenceSummary, summaryStr)
|
||
}
|
||
allResults["ebpf_evidence_summary"] = evidenceSummary
|
||
|
||
fmt.Printf("<22> Sending eBPF monitoring data to TensorZero:\n")
|
||
for _, summary := range evidenceSummary {
|
||
fmt.Printf(" - %s\n", summary)
|
||
}
|
||
|
||
fmt.Printf("✅ Executed %d commands, %d eBPF programs\n", len(commandResults), len(ebpfResults))
|
||
} else {
|
||
fmt.Printf("✅ Executed %d commands\n", len(commandResults))
|
||
}
|
||
|
||
resultsJSON, err := json.MarshalIndent(allResults, "", " ")
|
||
if err != nil {
|
||
return fmt.Errorf("failed to marshal command results: %w", err)
|
||
}
|
||
|
||
// Add AI response and command results to conversation
|
||
messages = append(messages, openai.ChatCompletionMessage{
|
||
Role: openai.ChatMessageRoleAssistant,
|
||
Content: content,
|
||
})
|
||
messages = append(messages, openai.ChatCompletionMessage{
|
||
Role: openai.ChatMessageRoleUser,
|
||
Content: string(resultsJSON),
|
||
})
|
||
|
||
continue
|
||
}
|
||
|
||
// Try to parse as resolution response
|
||
if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
|
||
// Handle resolution phase
|
||
fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n")
|
||
fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause)
|
||
fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan)
|
||
fmt.Printf("Confidence: %s\n", resolutionResp.Confidence)
|
||
break
|
||
}
|
||
|
||
// If we can't parse the response, treat it as an error or unexpected format
|
||
fmt.Printf("Unexpected response format or error from AI:\n%s\n", content)
|
||
break
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// executeEBPFPrograms executes REAL eBPF monitoring programs using the actual eBPF manager
|
||
func (a *LinuxDiagnosticAgent) executeEBPFPrograms(ebpfPrograms []EBPFRequest) []map[string]interface{} {
|
||
var results []map[string]interface{}
|
||
|
||
if a.ebpfManager == nil {
|
||
fmt.Printf("❌ eBPF manager not initialized\n")
|
||
return results
|
||
}
|
||
|
||
for _, prog := range ebpfPrograms {
|
||
fmt.Printf("🔬 Starting eBPF program [%s]: %s -> %s (%ds)\n", prog.Name, prog.Type, prog.Target, int(prog.Duration))
|
||
|
||
// Actually start the eBPF program using the real manager
|
||
programID, err := a.ebpfManager.StartEBPFProgram(prog)
|
||
if err != nil {
|
||
fmt.Printf("❌ Failed to start eBPF program [%s]: %v\n", prog.Name, err)
|
||
result := map[string]interface{}{
|
||
"name": prog.Name,
|
||
"type": prog.Type,
|
||
"target": prog.Target,
|
||
"duration": int(prog.Duration),
|
||
"description": prog.Description,
|
||
"status": "failed",
|
||
"error": err.Error(),
|
||
"success": false,
|
||
}
|
||
results = append(results, result)
|
||
continue
|
||
}
|
||
|
||
// Let the eBPF program run for the specified duration
|
||
fmt.Printf("⏰ Waiting %d seconds for eBPF program to collect data...\n", int(prog.Duration))
|
||
time.Sleep(time.Duration(prog.Duration) * time.Second)
|
||
|
||
// Give the collectEvents goroutine a moment to finish and store results
|
||
fmt.Printf("⏳ Allowing program to complete data collection...\n")
|
||
time.Sleep(500 * time.Millisecond)
|
||
|
||
// Get the results (should be in completedResults now)
|
||
fmt.Printf("📊 Getting results for eBPF program [%s]...\n", prog.Name)
|
||
|
||
// Use a channel to implement timeout for GetProgramResults
|
||
type resultPair struct {
|
||
trace *EBPFTrace
|
||
err error
|
||
}
|
||
resultChan := make(chan resultPair, 1)
|
||
|
||
go func() {
|
||
trace, err := a.ebpfManager.GetProgramResults(programID)
|
||
resultChan <- resultPair{trace, err}
|
||
}()
|
||
|
||
var trace *EBPFTrace
|
||
var resultErr error
|
||
|
||
select {
|
||
case result := <-resultChan:
|
||
trace = result.trace
|
||
resultErr = result.err
|
||
case <-time.After(3 * time.Second):
|
||
resultErr = fmt.Errorf("timeout getting results after 3 seconds")
|
||
}
|
||
|
||
// Try to stop the program (may already be stopped by collectEvents)
|
||
fmt.Printf("🛑 Stopping eBPF program [%s]...\n", prog.Name)
|
||
stopErr := a.ebpfManager.StopProgram(programID)
|
||
if stopErr != nil {
|
||
fmt.Printf("⚠️ eBPF program [%s] cleanup: %v (may have already completed)\n", prog.Name, stopErr)
|
||
// Don't return here, we still want to process results if we got them
|
||
}
|
||
|
||
if resultErr != nil {
|
||
fmt.Printf("❌ Failed to get results for eBPF program [%s]: %v\n", prog.Name, resultErr)
|
||
result := map[string]interface{}{
|
||
"name": prog.Name,
|
||
"type": prog.Type,
|
||
"target": prog.Target,
|
||
"duration": int(prog.Duration),
|
||
"description": prog.Description,
|
||
"status": "collection_failed",
|
||
"error": resultErr.Error(),
|
||
"success": false,
|
||
}
|
||
results = append(results, result)
|
||
continue
|
||
} // Process the real eBPF trace data
|
||
result := map[string]interface{}{
|
||
"name": prog.Name,
|
||
"type": prog.Type,
|
||
"target": prog.Target,
|
||
"duration": int(prog.Duration),
|
||
"description": prog.Description,
|
||
"status": "completed",
|
||
"success": true,
|
||
}
|
||
|
||
// Extract real data from the trace
|
||
if trace != nil {
|
||
result["trace_id"] = trace.TraceID
|
||
result["data_points"] = trace.EventCount
|
||
result["events"] = trace.Events
|
||
result["summary"] = trace.Summary
|
||
result["process_list"] = trace.ProcessList
|
||
result["start_time"] = trace.StartTime.Format(time.RFC3339)
|
||
result["end_time"] = trace.EndTime.Format(time.RFC3339)
|
||
result["actual_duration"] = trace.EndTime.Sub(trace.StartTime).Seconds()
|
||
|
||
fmt.Printf("✅ eBPF program [%s] completed - collected %d real events\n", prog.Name, trace.EventCount)
|
||
} else {
|
||
result["data_points"] = 0
|
||
result["error"] = "No trace data returned"
|
||
fmt.Printf("⚠️ eBPF program [%s] completed but returned no trace data\n", prog.Name)
|
||
}
|
||
|
||
results = append(results, result)
|
||
}
|
||
|
||
return results
|
||
}
|
||
|
||
// TensorZeroRequest represents a request structure compatible with TensorZero's episode_id
|
||
type TensorZeroRequest struct {
|
||
Model string `json:"model"`
|
||
Messages []openai.ChatCompletionMessage `json:"messages"`
|
||
EpisodeID string `json:"tensorzero::episode_id,omitempty"`
|
||
}
|
||
|
||
// TensorZeroResponse represents TensorZero's response with episode_id
|
||
type TensorZeroResponse struct {
|
||
openai.ChatCompletionResponse
|
||
EpisodeID string `json:"episode_id"`
|
||
}
|
||
|
||
// sendRequest sends a request to the TensorZero API via Supabase proxy with JWT authentication
|
||
func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
|
||
return a.sendRequestWithEpisode(messages, "")
|
||
}
|
||
|
||
// sendRequestWithEpisode sends a request with a specific episode ID
|
||
func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
|
||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||
defer cancel()
|
||
|
||
// Create TensorZero-compatible request
|
||
tzRequest := TensorZeroRequest{
|
||
Model: a.model,
|
||
Messages: messages,
|
||
}
|
||
|
||
// Include tensorzero::episode_id for conversation continuity
|
||
// Use agent's existing episode ID if available, otherwise use provided one
|
||
if a.episodeID != "" {
|
||
tzRequest.EpisodeID = a.episodeID
|
||
} else if episodeID != "" {
|
||
tzRequest.EpisodeID = episodeID
|
||
}
|
||
|
||
fmt.Printf("Debug: Sending request to model: %s", a.model)
|
||
if a.episodeID != "" {
|
||
fmt.Printf(" (episode: %s)", a.episodeID)
|
||
}
|
||
fmt.Println()
|
||
|
||
// Marshal the request
|
||
requestBody, err := json.Marshal(tzRequest)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||
}
|
||
|
||
// Get Supabase project URL and build TensorZero proxy endpoint
|
||
supabaseURL := os.Getenv("SUPABASE_PROJECT_URL")
|
||
if supabaseURL == "" {
|
||
supabaseURL = "https://gpqzsricripnvbrpsyws.supabase.co"
|
||
}
|
||
|
||
// Build Supabase function URL with OpenAI v1 compatible path
|
||
endpoint := supabaseURL + "/functions/v1/tensorzero-proxy/openai/v1/chat/completions"
|
||
|
||
req, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewBuffer(requestBody))
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||
}
|
||
|
||
req.Header.Set("Content-Type", "application/json")
|
||
|
||
// Add JWT authentication header
|
||
accessToken, err := a.getAccessToken()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to get access token: %w", err)
|
||
}
|
||
|
||
req.Header.Set("Authorization", "Bearer "+accessToken)
|
||
|
||
// Make the request
|
||
client := &http.Client{Timeout: 30 * time.Second}
|
||
resp, err := client.Do(req)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
// Read response body
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to read response: %w", err)
|
||
}
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
return nil, fmt.Errorf("TensorZero API request failed with status %d: %s", resp.StatusCode, string(body))
|
||
}
|
||
|
||
// Parse TensorZero response
|
||
var tzResponse TensorZeroResponse
|
||
if err := json.Unmarshal(body, &tzResponse); err != nil {
|
||
return nil, fmt.Errorf("failed to unmarshal response: %w", err)
|
||
}
|
||
|
||
// Extract episode_id from first response
|
||
if a.episodeID == "" && tzResponse.EpisodeID != "" {
|
||
a.episodeID = tzResponse.EpisodeID
|
||
fmt.Printf("Debug: Extracted episode ID: %s\n", a.episodeID)
|
||
}
|
||
|
||
return &tzResponse.ChatCompletionResponse, nil
|
||
}
|
||
|
||
// getAccessToken retrieves the current access token for authentication
|
||
func (a *LinuxDiagnosticAgent) getAccessToken() (string, error) {
|
||
// Read token from the standard token file location
|
||
tokenPath := os.Getenv("TOKEN_PATH")
|
||
if tokenPath == "" {
|
||
tokenPath = "/var/lib/nannyagent/token.json"
|
||
}
|
||
|
||
tokenData, err := os.ReadFile(tokenPath)
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to read token file: %w", err)
|
||
}
|
||
|
||
var tokenInfo struct {
|
||
AccessToken string `json:"access_token"`
|
||
}
|
||
|
||
if err := json.Unmarshal(tokenData, &tokenInfo); err != nil {
|
||
return "", fmt.Errorf("failed to parse token file: %w", err)
|
||
}
|
||
|
||
if tokenInfo.AccessToken == "" {
|
||
return "", fmt.Errorf("access token is empty")
|
||
}
|
||
|
||
return tokenInfo.AccessToken, nil
|
||
}
|