533 lines
18 KiB
Go
533 lines
18 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"nannyagentv2/internal/auth"
|
|
"nannyagentv2/internal/logging"
|
|
"nannyagentv2/internal/metrics"
|
|
|
|
"github.com/sashabaranov/go-openai"
|
|
)
|
|
|
|
// InvestigationRequest is the JSON body Supabase sends when it asks the agent
// to begin an investigation. The struct tags define the wire names; the fields
// are serialized in declaration order.
type InvestigationRequest struct {
	InvestigationID  string            `json:"investigation_id"`
	ApplicationGroup string            `json:"application_group"`
	Issue            string            `json:"issue"`
	Context          map[string]string `json:"context"`
	Priority         string            `json:"priority"`
	InitiatedBy      string            `json:"initiated_by"`
}
|
|
|
|
// InvestigationResponse represents the agent's response to an investigation
|
|
type InvestigationResponse struct {
|
|
AgentID string `json:"agent_id"`
|
|
InvestigationID string `json:"investigation_id"`
|
|
Status string `json:"status"`
|
|
Commands []CommandResult `json:"commands,omitempty"`
|
|
AIResponse string `json:"ai_response,omitempty"`
|
|
EpisodeID string `json:"episode_id,omitempty"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// InvestigationServer handles reverse investigation requests from Supabase
|
|
type InvestigationServer struct {
|
|
agent *LinuxDiagnosticAgent // Original agent for direct user interactions
|
|
applicationAgent *LinuxDiagnosticAgent // Separate agent for application-initiated investigations
|
|
port string
|
|
agentID string
|
|
metricsCollector *metrics.Collector
|
|
authManager *auth.AuthManager
|
|
startTime time.Time
|
|
supabaseURL string
|
|
}
|
|
|
|
// NewInvestigationServer creates a new investigation server
|
|
func NewInvestigationServer(agent *LinuxDiagnosticAgent, authManager *auth.AuthManager) *InvestigationServer {
|
|
port := os.Getenv("AGENT_PORT")
|
|
if port == "" {
|
|
port = "1234"
|
|
}
|
|
|
|
// Get agent ID from authentication system
|
|
var agentID string
|
|
if authManager != nil {
|
|
if id, err := authManager.GetCurrentAgentID(); err == nil {
|
|
agentID = id
|
|
|
|
} else {
|
|
logging.Error("Failed to get agent ID from auth manager: %v", err)
|
|
}
|
|
}
|
|
|
|
// Fallback to environment variable or generate one if auth fails
|
|
if agentID == "" {
|
|
agentID = os.Getenv("AGENT_ID")
|
|
if agentID == "" {
|
|
agentID = fmt.Sprintf("agent-%d", time.Now().Unix())
|
|
}
|
|
}
|
|
|
|
// Create metrics collector
|
|
metricsCollector := metrics.NewCollector("v2.0.0")
|
|
|
|
// Create a separate agent for application-initiated investigations
|
|
applicationAgent := NewLinuxDiagnosticAgent()
|
|
// Override the model to use the application-specific function
|
|
applicationAgent.model = "tensorzero::function_name::diagnose_and_heal_application"
|
|
|
|
return &InvestigationServer{
|
|
agent: agent,
|
|
applicationAgent: applicationAgent,
|
|
port: port,
|
|
agentID: agentID,
|
|
metricsCollector: metricsCollector,
|
|
authManager: authManager,
|
|
startTime: time.Now(),
|
|
supabaseURL: os.Getenv("SUPABASE_PROJECT_URL"),
|
|
}
|
|
}
|
|
|
|
// DiagnoseIssueForApplication handles diagnostic requests initiated from application/portal
|
|
func (s *InvestigationServer) DiagnoseIssueForApplication(issue, episodeID string) error {
|
|
// Set the episode ID on the application agent for continuity
|
|
s.applicationAgent.episodeID = episodeID
|
|
return s.applicationAgent.DiagnoseIssue(issue)
|
|
}
|
|
|
|
// Start starts the HTTP server and realtime polling for investigation requests
|
|
func (s *InvestigationServer) Start() error {
|
|
mux := http.NewServeMux()
|
|
|
|
// Health check endpoint
|
|
mux.HandleFunc("/health", s.handleHealth)
|
|
|
|
// Investigation endpoint
|
|
mux.HandleFunc("/investigate", s.handleInvestigation)
|
|
|
|
// Agent status endpoint
|
|
mux.HandleFunc("/status", s.handleStatus)
|
|
|
|
// Start realtime polling for backend-initiated investigations
|
|
if s.supabaseURL != "" && s.authManager != nil {
|
|
go s.startRealtimePolling()
|
|
logging.Info("Realtime investigation polling enabled")
|
|
} else {
|
|
logging.Warning("Realtime investigation polling disabled (missing Supabase config or auth)")
|
|
}
|
|
|
|
server := &http.Server{
|
|
Addr: ":" + s.port,
|
|
Handler: mux,
|
|
ReadTimeout: 30 * time.Second,
|
|
WriteTimeout: 30 * time.Second,
|
|
}
|
|
|
|
logging.Info("Investigation server started on port %s (Agent ID: %s)", s.port, s.agentID)
|
|
return server.ListenAndServe()
|
|
}
|
|
|
|
// handleHealth responds to health check requests
|
|
func (s *InvestigationServer) handleHealth(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
response := map[string]interface{}{
|
|
"status": "healthy",
|
|
"agent_id": s.agentID,
|
|
"timestamp": time.Now(),
|
|
"version": "v2.0.0",
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(response)
|
|
}
|
|
|
|
// handleStatus responds with agent status and capabilities
|
|
func (s *InvestigationServer) handleStatus(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
// Collect current system metrics
|
|
systemMetrics, err := s.metricsCollector.GatherSystemMetrics()
|
|
if err != nil {
|
|
http.Error(w, fmt.Sprintf("Failed to collect metrics: %v", err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
// Convert to metrics request format for consistent data structure
|
|
metricsReq := s.metricsCollector.CreateMetricsRequest(s.agentID, systemMetrics)
|
|
|
|
response := map[string]interface{}{
|
|
"agent_id": s.agentID,
|
|
"status": "ready",
|
|
"capabilities": []string{"system_diagnostics", "ebpf_monitoring", "command_execution", "ai_analysis"},
|
|
"system_info": map[string]interface{}{
|
|
"os": fmt.Sprintf("%s %s", metricsReq.OSInfo["platform"], metricsReq.OSInfo["platform_version"]),
|
|
"kernel": metricsReq.KernelVersion,
|
|
"architecture": metricsReq.OSInfo["kernel_arch"],
|
|
"cpu_cores": metricsReq.OSInfo["cpu_cores"],
|
|
"memory": metricsReq.MemoryUsage,
|
|
"private_ips": metricsReq.IPAddress,
|
|
"load_average": fmt.Sprintf("%.2f, %.2f, %.2f",
|
|
metricsReq.LoadAverages["load1"],
|
|
metricsReq.LoadAverages["load5"],
|
|
metricsReq.LoadAverages["load15"]),
|
|
"disk_usage": fmt.Sprintf("Root: %.0fG/%.0fG (%.0f%% used)",
|
|
float64(metricsReq.FilesystemInfo[0].Used)/1024/1024/1024,
|
|
float64(metricsReq.FilesystemInfo[0].Total)/1024/1024/1024,
|
|
metricsReq.DiskUsage),
|
|
},
|
|
"uptime": time.Since(s.startTime),
|
|
"last_contact": time.Now(),
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(response)
|
|
}
|
|
|
|
// sendCommandResultsToTensorZero sends command results back to TensorZero and continues conversation
|
|
func (s *InvestigationServer) sendCommandResultsToTensorZero(diagnosticResp DiagnosticResponse, commandResults []CommandResult) (interface{}, error) {
|
|
// Build conversation history like in agent.go
|
|
messages := []openai.ChatCompletionMessage{
|
|
// Add the original diagnostic response as assistant message
|
|
{
|
|
Role: openai.ChatMessageRoleAssistant,
|
|
Content: fmt.Sprintf(`{"response_type":"diagnostic","reasoning":"%s","commands":%s}`,
|
|
diagnosticResp.Reasoning,
|
|
mustMarshalJSON(diagnosticResp.Commands)),
|
|
},
|
|
}
|
|
|
|
// Add command results as user message (same as agent.go does)
|
|
resultsJSON, err := json.MarshalIndent(commandResults, "", " ")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal command results: %w", err)
|
|
}
|
|
|
|
messages = append(messages, openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleUser,
|
|
Content: string(resultsJSON),
|
|
})
|
|
|
|
// Send to TensorZero via application agent's sendRequest method
|
|
logging.Debug("Sending command results to TensorZero for analysis")
|
|
response, err := s.applicationAgent.sendRequest(messages)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to send request to TensorZero: %w", err)
|
|
}
|
|
|
|
if len(response.Choices) == 0 {
|
|
return nil, fmt.Errorf("no choices in TensorZero response")
|
|
}
|
|
|
|
content := response.Choices[0].Message.Content
|
|
logging.Debug("TensorZero continued analysis: %s", content)
|
|
|
|
// Try to parse the response to determine if it's diagnostic or resolution
|
|
var diagnosticNextResp DiagnosticResponse
|
|
var resolutionResp ResolutionResponse
|
|
|
|
// Check if it's another diagnostic response
|
|
if err := json.Unmarshal([]byte(content), &diagnosticNextResp); err == nil && diagnosticNextResp.ResponseType == "diagnostic" {
|
|
logging.Debug("TensorZero requests %d more commands", len(diagnosticNextResp.Commands))
|
|
return map[string]interface{}{
|
|
"type": "diagnostic",
|
|
"response": diagnosticNextResp,
|
|
"raw": content,
|
|
}, nil
|
|
}
|
|
|
|
// Check if it's a resolution response
|
|
if err := json.Unmarshal([]byte(content), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
|
|
|
|
return map[string]interface{}{
|
|
"type": "resolution",
|
|
"response": resolutionResp,
|
|
"raw": content,
|
|
}, nil
|
|
}
|
|
|
|
// Return raw response if we can't parse it
|
|
return map[string]interface{}{
|
|
"type": "unknown",
|
|
"raw": content,
|
|
}, nil
|
|
}
|
|
|
|
// mustMarshalJSON returns the JSON encoding of v as a string.
//
// Despite the name it never panics. When v cannot be encoded it returns the
// JSON literal "null" rather than an empty string, so the result can always be
// spliced into a larger JSON document without making it invalid.
func mustMarshalJSON(v interface{}) string {
	data, err := json.Marshal(v)
	if err != nil {
		return "null"
	}
	return string(data)
}
|
|
|
|
// processInvestigation handles the actual investigation using TensorZero
|
|
// This endpoint receives either:
|
|
// 1. DiagnosticResponse - Commands and eBPF programs to execute
|
|
// 2. ResolutionResponse - Final resolution (no execution needed)
|
|
func (s *InvestigationServer) handleInvestigation(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodPost {
|
|
http.Error(w, "Method not allowed - only POST accepted", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
// Parse the request body to determine what type of response this is
|
|
var requestBody map[string]interface{}
|
|
if err := json.NewDecoder(r.Body).Decode(&requestBody); err != nil {
|
|
http.Error(w, fmt.Sprintf("Invalid JSON: %v", err), http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Check the response_type field to determine how to handle this
|
|
responseType, ok := requestBody["response_type"].(string)
|
|
if !ok {
|
|
http.Error(w, "Missing or invalid response_type field", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
logging.Debug("Received investigation payload with response_type: %s", responseType)
|
|
|
|
switch responseType {
|
|
case "diagnostic":
|
|
// This is a DiagnosticResponse with commands to execute
|
|
response := s.handleDiagnosticExecution(requestBody)
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(response)
|
|
|
|
case "resolution":
|
|
// This is a ResolutionResponse - final result, just acknowledge
|
|
fmt.Printf("📋 Received final resolution from backend\n")
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(map[string]interface{}{
|
|
"success": true,
|
|
"message": "Resolution received and acknowledged",
|
|
"agent_id": s.agentID,
|
|
})
|
|
|
|
default:
|
|
http.Error(w, fmt.Sprintf("Unknown response_type: %s", responseType), http.StatusBadRequest)
|
|
return
|
|
}
|
|
}
|
|
|
|
// handleDiagnosticExecution executes commands from a DiagnosticResponse
|
|
func (s *InvestigationServer) handleDiagnosticExecution(requestBody map[string]interface{}) map[string]interface{} {
|
|
// Parse as DiagnosticResponse
|
|
var diagnosticResp DiagnosticResponse
|
|
|
|
// Convert the map back to JSON and then parse it properly
|
|
jsonData, err := json.Marshal(requestBody)
|
|
if err != nil {
|
|
return map[string]interface{}{
|
|
"success": false,
|
|
"error": fmt.Sprintf("Failed to re-marshal request: %v", err),
|
|
"agent_id": s.agentID,
|
|
}
|
|
}
|
|
|
|
if err := json.Unmarshal(jsonData, &diagnosticResp); err != nil {
|
|
return map[string]interface{}{
|
|
"success": false,
|
|
"error": fmt.Sprintf("Failed to parse DiagnosticResponse: %v", err),
|
|
"agent_id": s.agentID,
|
|
}
|
|
}
|
|
|
|
fmt.Printf("📋 Executing %d commands from backend\n", len(diagnosticResp.Commands))
|
|
|
|
// Execute all commands
|
|
commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
|
|
|
|
for i, cmdStr := range diagnosticResp.Commands {
|
|
// Convert string to Command struct
|
|
cmd := Command{
|
|
ID: fmt.Sprintf("cmd_%d", i),
|
|
Command: cmdStr,
|
|
Description: fmt.Sprintf("Investigation command: %s", cmdStr),
|
|
}
|
|
fmt.Printf("⚙️ Executing command '%s': %s\n", cmd.ID, cmd.Command)
|
|
|
|
// Use the agent's executor to run the command
|
|
result := s.agent.executor.Execute(cmd)
|
|
commandResults = append(commandResults, result)
|
|
|
|
if result.Error != "" {
|
|
fmt.Printf("⚠️ Command '%s' had error: %s\n", cmd.ID, result.Error)
|
|
}
|
|
}
|
|
|
|
// Send command results back to TensorZero for continued analysis
|
|
fmt.Printf("🔄 Sending %d command results back to TensorZero for continued analysis\n", len(commandResults))
|
|
|
|
nextResponse, err := s.sendCommandResultsToTensorZero(diagnosticResp, commandResults)
|
|
if err != nil {
|
|
return map[string]interface{}{
|
|
"success": false,
|
|
"error": fmt.Sprintf("Failed to continue TensorZero conversation: %v", err),
|
|
"agent_id": s.agentID,
|
|
"command_results": commandResults, // Still return the results
|
|
}
|
|
}
|
|
|
|
// Return both the command results and the next response from TensorZero
|
|
return map[string]interface{}{
|
|
"success": true,
|
|
"agent_id": s.agentID,
|
|
"command_results": commandResults,
|
|
"commands_executed": len(commandResults),
|
|
"next_response": nextResponse,
|
|
"timestamp": time.Now().Format(time.RFC3339),
|
|
}
|
|
}
|
|
|
|
// PendingInvestigation mirrors one row of the pending_investigations table as
// returned by the Supabase REST API.
//
// DiagnosticPayload is kept as a generic JSON object, and EpisodeID is a
// pointer so that a JSON null decodes to nil.
type PendingInvestigation struct {
	ID                string                 `json:"id"`
	InvestigationID   string                 `json:"investigation_id"`
	AgentID           string                 `json:"agent_id"`
	DiagnosticPayload map[string]interface{} `json:"diagnostic_payload"`
	EpisodeID         *string                `json:"episode_id"`
	Status            string                 `json:"status"`
	CreatedAt         time.Time              `json:"created_at"`
}
|
|
|
|
// startRealtimePolling begins polling for pending investigations
|
|
func (s *InvestigationServer) startRealtimePolling() {
|
|
fmt.Printf("🔄 Starting realtime investigation polling for agent %s\n", s.agentID)
|
|
|
|
ticker := time.NewTicker(5 * time.Second) // Poll every 5 seconds
|
|
defer ticker.Stop()
|
|
|
|
for range ticker.C {
|
|
s.checkForPendingInvestigations()
|
|
}
|
|
}
|
|
|
|
// checkForPendingInvestigations checks for new pending investigations
|
|
func (s *InvestigationServer) checkForPendingInvestigations() {
|
|
url := fmt.Sprintf("%s/rest/v1/pending_investigations?agent_id=eq.%s&status=eq.pending&order=created_at.desc",
|
|
s.supabaseURL, s.agentID)
|
|
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return // Silent fail for polling
|
|
}
|
|
|
|
// Get token from auth manager
|
|
authToken, err := s.authManager.LoadToken()
|
|
if err != nil {
|
|
return // Silent fail for polling
|
|
}
|
|
|
|
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken))
|
|
req.Header.Set("Accept", "application/json")
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return // Silent fail for polling
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return // Silent fail for polling
|
|
}
|
|
|
|
var investigations []PendingInvestigation
|
|
err = json.NewDecoder(resp.Body).Decode(&investigations)
|
|
if err != nil {
|
|
return // Silent fail for polling
|
|
}
|
|
|
|
for _, investigation := range investigations {
|
|
fmt.Printf("🔍 Found pending investigation: %s\n", investigation.ID)
|
|
go s.handlePendingInvestigation(investigation)
|
|
}
|
|
}
|
|
|
|
// handlePendingInvestigation processes a single pending investigation
|
|
func (s *InvestigationServer) handlePendingInvestigation(investigation PendingInvestigation) {
|
|
fmt.Printf("🚀 Processing realtime investigation %s\n", investigation.InvestigationID)
|
|
|
|
// Mark as executing
|
|
err := s.updateInvestigationStatus(investigation.ID, "executing", nil, nil)
|
|
if err != nil {
|
|
fmt.Printf("❌ Failed to mark investigation as executing: %v\n", err)
|
|
return
|
|
}
|
|
|
|
// Execute diagnostic commands using existing handleDiagnosticExecution method
|
|
results := s.handleDiagnosticExecution(investigation.DiagnosticPayload)
|
|
|
|
// Mark as completed with results
|
|
err = s.updateInvestigationStatus(investigation.ID, "completed", results, nil)
|
|
if err != nil {
|
|
fmt.Printf("❌ Failed to mark investigation as completed: %v\n", err)
|
|
return
|
|
}
|
|
|
|
}
|
|
|
|
// updateInvestigationStatus updates the status of a pending investigation
|
|
func (s *InvestigationServer) updateInvestigationStatus(id, status string, results map[string]interface{}, errorMsg *string) error {
|
|
updateData := map[string]interface{}{
|
|
"status": status,
|
|
}
|
|
|
|
if status == "executing" {
|
|
updateData["started_at"] = time.Now().UTC().Format(time.RFC3339)
|
|
} else if status == "completed" {
|
|
updateData["completed_at"] = time.Now().UTC().Format(time.RFC3339)
|
|
if results != nil {
|
|
updateData["command_results"] = results
|
|
}
|
|
} else if status == "failed" && errorMsg != nil {
|
|
updateData["error_message"] = *errorMsg
|
|
updateData["completed_at"] = time.Now().UTC().Format(time.RFC3339)
|
|
}
|
|
|
|
jsonData, err := json.Marshal(updateData)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal update data: %v", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("%s/rest/v1/pending_investigations?id=eq.%s", s.supabaseURL, id)
|
|
req, err := http.NewRequest("PATCH", url, strings.NewReader(string(jsonData)))
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create request: %v", err)
|
|
}
|
|
|
|
// Get token from auth manager
|
|
authToken, err := s.authManager.LoadToken()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load auth token: %v", err)
|
|
}
|
|
|
|
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", authToken.AccessToken))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to update investigation: %v", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 && resp.StatusCode != 204 {
|
|
return fmt.Errorf("supabase update error: %d", resp.StatusCode)
|
|
}
|
|
|
|
return nil
|
|
}
|