somewhat working ebpf bpftrace

This commit is contained in:
Harshavardhan Musanalli
2025-11-08 20:42:07 +01:00
parent 190e54dd38
commit 794111cb44
16 changed files with 2834 additions and 216 deletions

View File

@@ -13,7 +13,9 @@ import (
"time"
"nannyagentv2/internal/auth"
"nannyagentv2/internal/logging"
"nannyagentv2/internal/metrics"
"nannyagentv2/internal/types"
"github.com/gorilla/websocket"
"github.com/sashabaranov/go-openai"
@@ -74,7 +76,7 @@ func NewWebSocketClient(agent *LinuxDiagnosticAgent, authManager *auth.AuthManag
agentID = id
// Agent ID retrieved successfully
} else {
fmt.Printf("Failed to get agent ID from auth manager: %v\n", err)
logging.Error("Failed to get agent ID from auth manager: %v", err)
}
}
@@ -178,7 +180,7 @@ func (c *WebSocketClient) connect() error {
if err != nil {
c.consecutiveFailures++
if c.consecutiveFailures >= 5 && resp != nil {
fmt.Printf("WebSocket handshake failed with status: %d (failure #%d)\n", resp.StatusCode, c.consecutiveFailures)
logging.Error("WebSocket handshake failed with status: %d (failure #%d)", resp.StatusCode, c.consecutiveFailures)
}
return fmt.Errorf("websocket connection failed: %v", err)
}
@@ -205,7 +207,7 @@ func (c *WebSocketClient) handleMessages() {
case <-c.ctx.Done():
// Only log context cancellation if there have been failures
if c.consecutiveFailures >= 5 {
fmt.Printf("📡 Context cancelled after %v, stopping message handler\n", time.Since(connectionStart))
logging.Debug("Context cancelled after %v, stopping message handler", time.Since(connectionStart))
}
return
default:
@@ -223,14 +225,14 @@ func (c *WebSocketClient) handleMessages() {
// Only log specific errors after failure threshold
if c.consecutiveFailures >= 5 {
if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) {
log.Printf("🔒 WebSocket closed normally after %v: %v", connectionDuration, err)
logging.Debug("WebSocket closed normally after %v: %v", connectionDuration, err)
} else if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
log.Printf("💥 ABNORMAL CLOSE after %v (code 1006 = server-side timeout/kill): %v", connectionDuration, err)
log.Printf("🕒 Last read took %v, connection lived %v", readDuration, connectionDuration)
logging.Error("ABNORMAL CLOSE after %v (code 1006 = server-side timeout/kill): %v", connectionDuration, err)
logging.Debug("Last read took %v, connection lived %v", readDuration, connectionDuration)
} else if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
log.Printf("READ TIMEOUT after %v: %v", connectionDuration, err)
logging.Warning("READ TIMEOUT after %v: %v", connectionDuration, err)
} else {
log.Printf("WebSocket error after %v: %v", connectionDuration, err)
logging.Error("WebSocket error after %v: %v", connectionDuration, err)
}
}
@@ -239,7 +241,7 @@ func (c *WebSocketClient) handleMessages() {
// Only show diagnostics after multiple failures
if c.consecutiveFailures >= 5 {
log.Printf("🔍 DIAGNOSTIC - Connection failed #%d after %v", c.consecutiveFailures, connectionDuration)
logging.Debug("DIAGNOSTIC - Connection failed #%d after %v", c.consecutiveFailures, connectionDuration)
}
// Attempt reconnection instead of returning immediately
@@ -265,7 +267,7 @@ func (c *WebSocketClient) handleMessages() {
// Task result acknowledged
default:
log.Printf("⚠️ Unknown message type: %s", message.Type)
logging.Warning("Unknown message type: %s", message.Type)
}
}
}
@@ -276,14 +278,14 @@ func (c *WebSocketClient) handleInvestigationTask(data interface{}) {
// Parse task data
taskBytes, err := json.Marshal(data)
if err != nil {
log.Printf("Error marshaling task data: %v", err)
logging.Error("Error marshaling task data: %v", err)
return
}
var task InvestigationTask
err = json.Unmarshal(taskBytes, &task)
if err != nil {
log.Printf("Error unmarshaling investigation task: %v", err)
logging.Error("Error unmarshaling investigation task: %v", err)
return
}
@@ -300,7 +302,7 @@ func (c *WebSocketClient) handleInvestigationTask(data interface{}) {
if err != nil {
taskResult.Error = err.Error()
fmt.Printf("Task execution failed: %v\n", err)
logging.Error("Task execution failed: %v", err)
} else {
taskResult.CommandResults = results
// Task executed successfully
@@ -356,7 +358,7 @@ func (c *WebSocketClient) executeDiagnosticCommands(diagnosticPayload map[string
if err != nil {
result["error"] = err.Error()
fmt.Printf("Command [%s] failed: %v (exit code: %d)\n", id, err, exitCode)
logging.Warning("Command [%s] failed: %v (exit code: %d)", id, err, exitCode)
}
commandResults = append(commandResults, result)
@@ -379,7 +381,7 @@ func (c *WebSocketClient) executeDiagnosticCommands(diagnosticPayload map[string
// executeEBPFPrograms executes eBPF monitoring programs using the real eBPF manager
func (c *WebSocketClient) executeEBPFPrograms(ebpfPrograms []interface{}) []map[string]interface{} {
var ebpfRequests []EBPFRequest
var ebpfRequests []types.EBPFRequest
// Convert interface{} to EBPFRequest structs
for _, prog := range ebpfPrograms {
@@ -398,7 +400,7 @@ func (c *WebSocketClient) executeEBPFPrograms(ebpfPrograms []interface{}) []map[
continue
}
ebpfRequests = append(ebpfRequests, EBPFRequest{
ebpfRequests = append(ebpfRequests, types.EBPFRequest{
Name: name,
Type: progType,
Target: target,
@@ -444,7 +446,7 @@ func (c *WebSocketClient) executeCommandsFromPayload(commands []interface{}) []m
if err != nil {
result["error"] = err.Error()
fmt.Printf("Command [%s] failed: %v (exit code: %d)\n", id, err, exitCode)
logging.Warning("Command [%s] failed: %v (exit code: %d)", id, err, exitCode)
}
commandResults = append(commandResults, result)
@@ -502,7 +504,7 @@ func (c *WebSocketClient) sendTaskResult(result TaskResult) {
err := c.conn.WriteJSON(message)
if err != nil {
log.Printf("Error sending task result: %v", err)
logging.Error("Error sending task result: %v", err)
}
}
@@ -516,7 +518,7 @@ func (c *WebSocketClient) startHeartbeat() {
for {
select {
case <-c.ctx.Done():
fmt.Printf("💓 Heartbeat stopped due to context cancellation\n")
logging.Debug("Heartbeat stopped due to context cancellation")
return
case <-ticker.C:
// Sending heartbeat
@@ -531,8 +533,8 @@ func (c *WebSocketClient) startHeartbeat() {
err := c.conn.WriteJSON(heartbeat)
if err != nil {
log.Printf("Error sending heartbeat: %v", err)
fmt.Printf("💓 Heartbeat failed, connection likely dead\n")
logging.Error("Error sending heartbeat: %v", err)
logging.Debug("Heartbeat failed, connection likely dead")
return
}
// Heartbeat sent
@@ -656,14 +658,14 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
for {
tzResp, tzErr := c.agent.sendRequestWithEpisode(messages, episodeID)
if tzErr != nil {
fmt.Printf("⚠️ TensorZero continuation failed: %v\n", tzErr)
logging.Warning("TensorZero continuation failed: %v", tzErr)
// Fall back to marking completed with command results only
c.updateInvestigationStatus(investigation.ID, "completed", resultsForDB, nil)
return
}
if len(tzResp.Choices) == 0 {
fmt.Printf("⚠️ No choices in TensorZero response\n")
logging.Warning("No choices in TensorZero response")
c.updateInvestigationStatus(investigation.ID, "completed", resultsForDB, nil)
return
}
@@ -672,7 +674,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
if len(aiContent) > 300 {
// AI response received successfully
} else {
fmt.Printf("🤖 AI Response: %s\n", aiContent)
logging.Debug("AI Response: %s", aiContent)
}
// Check if this is a resolution response (final)
@@ -683,14 +685,14 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
Confidence string `json:"confidence"`
}
fmt.Printf("🔍 Analyzing AI response type...\n")
logging.Debug("Analyzing AI response type...")
if err := json.Unmarshal([]byte(aiContent), &resolutionResp); err == nil && resolutionResp.ResponseType == "resolution" {
// This is the final resolution - show summary and complete
fmt.Printf("\n=== DIAGNOSIS COMPLETE ===\n")
fmt.Printf("Root Cause: %s\n", resolutionResp.RootCause)
fmt.Printf("Resolution Plan: %s\n", resolutionResp.ResolutionPlan)
fmt.Printf("Confidence: %s\n", resolutionResp.Confidence)
logging.Info("=== DIAGNOSIS COMPLETE ===")
logging.Info("Root Cause: %s", resolutionResp.RootCause)
logging.Info("Resolution Plan: %s", resolutionResp.ResolutionPlan)
logging.Info("Confidence: %s", resolutionResp.Confidence)
finalAIContent = aiContent
break
}
@@ -703,7 +705,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
}
if err := json.Unmarshal([]byte(aiContent), &diagnosticResp); err == nil && diagnosticResp.ResponseType == "diagnostic" {
fmt.Printf("🔄 AI requested additional diagnostics, executing...\n")
logging.Debug("AI requested additional diagnostics, executing...")
// Execute additional commands if any
additionalResults := map[string]interface{}{
@@ -711,7 +713,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
}
if len(diagnosticResp.Commands) > 0 {
fmt.Printf("🔧 Executing %d additional diagnostic commands...\n", len(diagnosticResp.Commands))
logging.Debug("Executing %d additional diagnostic commands", len(diagnosticResp.Commands))
commandResults := c.executeCommandsFromPayload(diagnosticResp.Commands)
additionalResults["command_results"] = commandResults
}
@@ -738,7 +740,7 @@ func (c *WebSocketClient) handlePendingInvestigation(investigation PendingInvest
}
// If neither resolution nor diagnostic, treat as final response
fmt.Printf("⚠️ Unknown response type - treating as final response\n")
logging.Warning("Unknown response type - treating as final response")
finalAIContent = aiContent
break
}
@@ -814,21 +816,21 @@ func (c *WebSocketClient) attemptReconnection() {
// Only show messages after 5 consecutive failures
if c.consecutiveFailures >= 5 {
log.Printf("🔄 Attempting WebSocket reconnection (attempt %d/%d) - %d consecutive failures", i+1, len(backoffDurations), c.consecutiveFailures)
logging.Info("Attempting WebSocket reconnection (attempt %d/%d) - %d consecutive failures", i+1, len(backoffDurations), c.consecutiveFailures)
}
time.Sleep(backoff)
if err := c.connect(); err != nil {
if c.consecutiveFailures >= 5 {
log.Printf("Reconnection attempt %d failed: %v", i+1, err)
logging.Warning("Reconnection attempt %d failed: %v", i+1, err)
}
continue
}
// Successfully reconnected - reset failure counter
if c.consecutiveFailures >= 5 {
log.Printf("WebSocket reconnected successfully after %d failures", c.consecutiveFailures)
logging.Info("WebSocket reconnected successfully after %d failures", c.consecutiveFailures)
}
c.consecutiveFailures = 0
go c.handleMessages() // Restart message handling
@@ -836,5 +838,5 @@ func (c *WebSocketClient) attemptReconnection() {
}
}
log.Printf("Failed to reconnect after %d attempts, giving up", len(backoffDurations))
logging.Error("Failed to reconnect after %d attempts, giving up", len(backoffDurations))
}