Agent and websocket investigations work fine

This commit is contained in:
Harshavardhan Musanalli
2025-10-27 19:13:39 +01:00
parent 0a8b2dc202
commit 8832450a1f
8 changed files with 1694 additions and 19 deletions

204
agent.go
View File

@@ -102,7 +102,7 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
for {
// Send request to TensorZero API via OpenAI SDK
response, err := a.sendRequest(messages)
response, err := a.sendRequestWithEpisode(messages, a.episodeID)
if err != nil {
return fmt.Errorf("failed to send request: %w", err)
}
@@ -115,34 +115,73 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
fmt.Printf("\nAI Response:\n%s\n", content)
// Parse the response to determine next action
var diagnosticResp DiagnosticResponse
var diagnosticResp EBPFEnhancedDiagnosticResponse
var resolutionResp ResolutionResponse
// Try to parse as diagnostic response first
// Try to parse as diagnostic response first (with eBPF support)
if err := json.Unmarshal([]byte(content), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
// Handle diagnostic phase
fmt.Printf("\nReasoning: %s\n", diagnosticResp.Reasoning)
if len(diagnosticResp.Commands) == 0 {
fmt.Println("No commands to execute in diagnostic phase")
break
}
// Execute commands and collect results
commandResults := make([]CommandResult, 0, len(diagnosticResp.Commands))
for _, cmd := range diagnosticResp.Commands {
fmt.Printf("\nExecuting command '%s': %s\n", cmd.ID, cmd.Command)
result := a.executor.Execute(cmd)
commandResults = append(commandResults, result)
if len(diagnosticResp.Commands) > 0 {
fmt.Printf("🔧 Executing diagnostic commands...\n")
for _, cmd := range diagnosticResp.Commands {
fmt.Printf("⚙️ Executing command '%s': %s\n", cmd.ID, cmd.Command)
result := a.executor.Execute(cmd)
commandResults = append(commandResults, result)
fmt.Printf("Output:\n%s\n", result.Output)
if result.Error != "" {
fmt.Printf("Error: %s\n", result.Error)
if result.ExitCode == 0 {
fmt.Printf("✅ Command '%s' completed successfully\n", cmd.ID)
} else {
fmt.Printf("❌ Command '%s' failed with exit code %d\n", cmd.ID, result.ExitCode)
}
}
}
// Prepare command results as user message
resultsJSON, err := json.MarshalIndent(commandResults, "", " ")
// Execute eBPF programs if present
var ebpfResults []map[string]interface{}
if len(diagnosticResp.EBPFPrograms) > 0 {
fmt.Printf("🔬 Executing %d eBPF programs...\n", len(diagnosticResp.EBPFPrograms))
ebpfResults = a.executeEBPFPrograms(diagnosticResp.EBPFPrograms)
}
// Prepare combined results as user message
allResults := map[string]interface{}{
"command_results": commandResults,
"executed_commands": len(commandResults),
}
// Include eBPF results if any were executed
if len(ebpfResults) > 0 {
allResults["ebpf_results"] = ebpfResults
allResults["executed_ebpf_programs"] = len(ebpfResults)
// Extract evidence summary for TensorZero
evidenceSummary := make([]string, 0)
for _, result := range ebpfResults {
name := result["name"]
eventCount := result["data_points"]
description := result["description"]
status := result["status"]
summaryStr := fmt.Sprintf("%s: %v events (%s) - %s", name, eventCount, status, description)
evidenceSummary = append(evidenceSummary, summaryStr)
}
allResults["ebpf_evidence_summary"] = evidenceSummary
fmt.Printf("<22> Sending eBPF monitoring data to TensorZero:\n")
for _, summary := range evidenceSummary {
fmt.Printf(" - %s\n", summary)
}
fmt.Printf("✅ Executed %d commands, %d eBPF programs\n", len(commandResults), len(ebpfResults))
} else {
fmt.Printf("✅ Executed %d commands\n", len(commandResults))
}
resultsJSON, err := json.MarshalIndent(allResults, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal command results: %w", err)
}
@@ -178,6 +217,127 @@ func (a *LinuxDiagnosticAgent) DiagnoseIssue(issue string) error {
return nil
}
// executeEBPFPrograms runs each requested eBPF program through the agent's
// eBPF manager, one after another, and returns one result map per program in
// request order.
//
// For every program it starts the program, sleeps for the requested duration
// plus a 500ms grace period, fetches the trace with a 3 second timeout, and
// then asks the manager to stop the program. Every result map carries "name",
// "type", "target", "duration", "description", "status", "success" and
// "data_points"; failed runs additionally carry "error", and completed runs
// that returned a trace add the trace details (events, summary, timings).
//
// It returns nil when the eBPF manager is not initialized or when no programs
// were requested.
func (a *LinuxDiagnosticAgent) executeEBPFPrograms(ebpfPrograms []EBPFRequest) []map[string]interface{} {
	var results []map[string]interface{}

	if a.ebpfManager == nil {
		fmt.Printf("❌ eBPF manager not initialized\n")
		return results
	}

	// resultPair carries the outcome of GetProgramResults across the channel
	// used to bound that call with a timeout.
	type resultPair struct {
		trace *EBPFTrace
		err   error
	}

	// baseResult builds the fields shared by every outcome of a program run.
	baseResult := func(prog EBPFRequest, status string, success bool) map[string]interface{} {
		return map[string]interface{}{
			"name":        prog.Name,
			"type":        prog.Type,
			"target":      prog.Target,
			"duration":    int(prog.Duration),
			"description": prog.Description,
			"status":      status,
			"success":     success,
		}
	}

	// failedResult describes a run that produced no usable trace. data_points
	// is set to 0 explicitly so that consumers formatting it (for example with
	// %v) see "0" rather than "<nil>" for a missing key.
	failedResult := func(prog EBPFRequest, status string, err error) map[string]interface{} {
		result := baseResult(prog, status, false)
		result["error"] = err.Error()
		result["data_points"] = 0
		return result
	}

	for _, prog := range ebpfPrograms {
		fmt.Printf("🔬 Starting eBPF program [%s]: %s -> %s (%ds)\n", prog.Name, prog.Type, prog.Target, int(prog.Duration))

		// Start the eBPF program using the manager.
		programID, err := a.ebpfManager.StartEBPFProgram(prog)
		if err != nil {
			fmt.Printf("❌ Failed to start eBPF program [%s]: %v\n", prog.Name, err)
			results = append(results, failedResult(prog, "failed", err))
			continue
		}

		// Let the eBPF program run for the specified duration.
		fmt.Printf("⏰ Waiting %d seconds for eBPF program to collect data...\n", int(prog.Duration))
		time.Sleep(time.Duration(prog.Duration) * time.Second)

		// Grace period so the manager's collector can finish and store results.
		fmt.Printf("⏳ Allowing program to complete data collection...\n")
		time.Sleep(500 * time.Millisecond)

		fmt.Printf("📊 Getting results for eBPF program [%s]...\n", prog.Name)

		// Fetch results in a goroutine so the call can be bounded by a timeout.
		// The channel is buffered so the goroutine can always deliver its
		// result and exit even if the timeout below fires first.
		// NOTE(review): if GetProgramResults itself never returns, this
		// goroutine stays blocked; there is no cancellation hook visible here.
		resultChan := make(chan resultPair, 1)
		go func() {
			trace, err := a.ebpfManager.GetProgramResults(programID)
			resultChan <- resultPair{trace, err}
		}()

		var trace *EBPFTrace
		var resultErr error
		select {
		case fetched := <-resultChan:
			trace = fetched.trace
			resultErr = fetched.err
		case <-time.After(3 * time.Second):
			resultErr = fmt.Errorf("timeout getting results after 3 seconds")
		}

		// Always try to stop the program, even when fetching results failed.
		// A stop error is only reported, not treated as fatal: the program
		// may already have completed on its own.
		fmt.Printf("🛑 Stopping eBPF program [%s]...\n", prog.Name)
		if stopErr := a.ebpfManager.StopProgram(programID); stopErr != nil {
			fmt.Printf("⚠️ eBPF program [%s] cleanup: %v (may have already completed)\n", prog.Name, stopErr)
		}

		if resultErr != nil {
			fmt.Printf("❌ Failed to get results for eBPF program [%s]: %v\n", prog.Name, resultErr)
			results = append(results, failedResult(prog, "collection_failed", resultErr))
			continue
		}

		// Process the eBPF trace data.
		result := baseResult(prog, "completed", true)
		if trace != nil {
			result["trace_id"] = trace.TraceID
			result["data_points"] = trace.EventCount
			result["events"] = trace.Events
			result["summary"] = trace.Summary
			result["process_list"] = trace.ProcessList
			result["start_time"] = trace.StartTime.Format(time.RFC3339)
			result["end_time"] = trace.EndTime.Format(time.RFC3339)
			result["actual_duration"] = trace.EndTime.Sub(trace.StartTime).Seconds()

			fmt.Printf("✅ eBPF program [%s] completed - collected %d real events\n", prog.Name, trace.EventCount)
		} else {
			result["data_points"] = 0
			result["error"] = "No trace data returned"
			fmt.Printf("⚠️ eBPF program [%s] completed but returned no trace data\n", prog.Name)
		}

		results = append(results, result)
	}

	return results
}
// TensorZeroRequest represents a request structure compatible with TensorZero's episode_id
type TensorZeroRequest struct {
Model string `json:"model"`
@@ -193,6 +353,11 @@ type TensorZeroResponse struct {
// sendRequest forwards messages to sendRequestWithEpisode without supplying
// an explicit episode ID, and returns that call's response and error as-is.
func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessage) (*openai.ChatCompletionResponse, error) {
	// An empty episode ID means "no caller-provided episode".
	const noEpisodeID = ""
	return a.sendRequestWithEpisode(messages, noEpisodeID)
}
// sendRequestWithEpisode sends a request with a specific episode ID
func (a *LinuxDiagnosticAgent) sendRequestWithEpisode(messages []openai.ChatCompletionMessage, episodeID string) (*openai.ChatCompletionResponse, error) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
@@ -202,9 +367,12 @@ func (a *LinuxDiagnosticAgent) sendRequest(messages []openai.ChatCompletionMessa
Messages: messages,
}
// Include tensorzero::episode_id for conversation continuity (if we have one)
// Include tensorzero::episode_id for conversation continuity
// Use agent's existing episode ID if available, otherwise use provided one
if a.episodeID != "" {
tzRequest.EpisodeID = a.episodeID
} else if episodeID != "" {
tzRequest.EpisodeID = episodeID
}
fmt.Printf("Debug: Sending request to model: %s", a.model)