Somewhat okay refactoring
This commit is contained in:
587
internal/ebpf/ebpf_trace_manager.go
Normal file
587
internal/ebpf/ebpf_trace_manager.go
Normal file
@@ -0,0 +1,587 @@
|
||||
package ebpf
|
||||
|
||||
import (
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"sort"
	"strings"
	"sync"
	"time"

	"nannyagentv2/internal/logging"
)
|
||||
|
||||
// TraceSpec describes one trace request, modelled on the probe
// specifications accepted by BCC's trace.py.
type TraceSpec struct {
	// ProbeType selects the probe kind: "p" (kprobe), "r" (kretprobe),
	// "t" (tracepoint) or "u" (uprobe).
	ProbeType string `json:"probe_type"`

	// Target is the function, syscall or tracepoint to attach to.
	Target string `json:"target"`

	// Library is the binary or shared object for userspace probes; it
	// stays empty for kernel probes.
	Library string `json:"library,omitempty"`

	// Format is the output format string (e.g. "read %d bytes").
	Format string `json:"format"`

	// Arguments lists the values to extract (e.g. ["arg1", "arg2", "retval"]).
	Arguments []string `json:"arguments"`

	// Filter is an optional condition (e.g. "arg3 > 20000").
	Filter string `json:"filter,omitempty"`

	// Duration is the length of the trace in seconds.
	Duration int `json:"duration"`

	// PID optionally restricts the trace to one process ID; the filter
	// builder in this file treats 0 as "no PID filter".
	PID int `json:"pid,omitempty"`

	// TID optionally restricts the trace to one thread ID; the filter
	// builder in this file treats 0 as "no TID filter".
	TID int `json:"tid,omitempty"`

	// UID optionally restricts the trace to one user ID. The filter builder
	// in this file treats -1 (not 0) as "no UID filter", so a zero value
	// filters on UID 0.
	UID int `json:"uid,omitempty"`

	// ProcessName optionally restricts the trace to one process name.
	ProcessName string `json:"process_name,omitempty"`
}
|
||||
|
||||
// TraceEvent is a single event captured from a running eBPF trace.
type TraceEvent struct {
	// Timestamp of the event.
	// NOTE(review): units are not visible here; the generated script prints
	// bpftrace's nsecs as the first field — confirm against the event scanner.
	Timestamp int64 `json:"timestamp"`

	// Identity of the task that triggered the event.
	PID         int    `json:"pid"`
	TID         int    `json:"tid"`
	UID         int    `json:"uid"`
	ProcessName string `json:"process_name"`

	// Function is the traced function name and Message the formatted output.
	Function string `json:"function"`
	Message  string `json:"message"`

	// RawArgs carries argument values keyed by name.
	RawArgs map[string]string `json:"raw_args"`

	// CPU is omitted from JSON when zero.
	CPU int `json:"cpu,omitempty"`
}
|
||||
|
||||
// TraceResult represents the results of a tracing session
|
||||
type TraceResult struct {
|
||||
TraceID string `json:"trace_id"`
|
||||
Spec TraceSpec `json:"spec"`
|
||||
Events []TraceEvent `json:"events"`
|
||||
EventCount int `json:"event_count"`
|
||||
StartTime time.Time `json:"start_time"`
|
||||
EndTime time.Time `json:"end_time"`
|
||||
Summary string `json:"summary"`
|
||||
Statistics TraceStats `json:"statistics"`
|
||||
}
|
||||
|
||||
// TraceStats provides statistics about the trace
|
||||
type TraceStats struct {
|
||||
TotalEvents int `json:"total_events"`
|
||||
EventsByProcess map[string]int `json:"events_by_process"`
|
||||
EventsByUID map[int]int `json:"events_by_uid"`
|
||||
EventsPerSecond float64 `json:"events_per_second"`
|
||||
TopProcesses []ProcessStat `json:"top_processes"`
|
||||
}
|
||||
|
||||
// ProcessStat holds the per-process share of a trace's events.
type ProcessStat struct {
	// ProcessName is the name the events were grouped under.
	ProcessName string `json:"process_name"`

	// PID is the process ID associated with the entry.
	PID int `json:"pid"`

	// EventCount is the number of events attributed to the process.
	EventCount int `json:"event_count"`

	// Percentage is EventCount as a percentage of all events in the trace.
	Percentage float64 `json:"percentage"`
}
|
||||
|
||||
// BCCTraceManager implements advanced eBPF tracing similar to BCC trace.py
|
||||
type BCCTraceManager struct {
|
||||
traces map[string]*RunningTrace
|
||||
tracesLock sync.RWMutex
|
||||
traceCounter int
|
||||
capabilities map[string]bool
|
||||
}
|
||||
|
||||
// RunningTrace represents an active trace session
|
||||
type RunningTrace struct {
|
||||
ID string
|
||||
Spec TraceSpec
|
||||
Process *exec.Cmd
|
||||
Events []TraceEvent
|
||||
StartTime time.Time
|
||||
Cancel context.CancelFunc
|
||||
Context context.Context
|
||||
Done chan struct{} // Signal when trace monitoring is complete
|
||||
}
|
||||
|
||||
// NewBCCTraceManager creates a new BCC-style trace manager
|
||||
func NewBCCTraceManager() *BCCTraceManager {
|
||||
manager := &BCCTraceManager{
|
||||
traces: make(map[string]*RunningTrace),
|
||||
capabilities: make(map[string]bool),
|
||||
}
|
||||
|
||||
manager.testCapabilities()
|
||||
return manager
|
||||
}
|
||||
|
||||
// testCapabilities checks what tracing capabilities are available
|
||||
func (tm *BCCTraceManager) testCapabilities() {
|
||||
// Test if bpftrace is available
|
||||
if _, err := exec.LookPath("bpftrace"); err == nil {
|
||||
tm.capabilities["bpftrace"] = true
|
||||
} else {
|
||||
tm.capabilities["bpftrace"] = false
|
||||
}
|
||||
|
||||
// Test if perf is available for fallback
|
||||
if _, err := exec.LookPath("perf"); err == nil {
|
||||
tm.capabilities["perf"] = true
|
||||
} else {
|
||||
tm.capabilities["perf"] = false
|
||||
}
|
||||
|
||||
// Test root privileges (required for eBPF)
|
||||
tm.capabilities["root_access"] = os.Geteuid() == 0
|
||||
|
||||
// Test kernel version
|
||||
cmd := exec.Command("uname", "-r")
|
||||
output, err := cmd.Output()
|
||||
if err == nil {
|
||||
version := strings.TrimSpace(string(output))
|
||||
// eBPF requires kernel 4.4+
|
||||
tm.capabilities["kernel_ebpf"] = !strings.HasPrefix(version, "3.")
|
||||
} else {
|
||||
tm.capabilities["kernel_ebpf"] = false
|
||||
}
|
||||
|
||||
// Test if we can access debugfs
|
||||
if _, err := os.Stat("/sys/kernel/debug/tracing/available_events"); err == nil {
|
||||
tm.capabilities["debugfs_access"] = true
|
||||
} else {
|
||||
tm.capabilities["debugfs_access"] = false
|
||||
}
|
||||
|
||||
logging.Debug("BCC Trace capabilities: %+v", tm.capabilities)
|
||||
}
|
||||
|
||||
// GetCapabilities returns available tracing capabilities
|
||||
func (tm *BCCTraceManager) GetCapabilities() map[string]bool {
|
||||
tm.tracesLock.RLock()
|
||||
defer tm.tracesLock.RUnlock()
|
||||
|
||||
caps := make(map[string]bool)
|
||||
for k, v := range tm.capabilities {
|
||||
caps[k] = v
|
||||
}
|
||||
return caps
|
||||
}
|
||||
|
||||
// StartTrace starts a new trace session based on the specification
|
||||
func (tm *BCCTraceManager) StartTrace(spec TraceSpec) (string, error) {
|
||||
if !tm.capabilities["bpftrace"] {
|
||||
return "", fmt.Errorf("bpftrace not available - install bpftrace package")
|
||||
}
|
||||
|
||||
if !tm.capabilities["root_access"] {
|
||||
return "", fmt.Errorf("root access required for eBPF tracing")
|
||||
}
|
||||
|
||||
if !tm.capabilities["kernel_ebpf"] {
|
||||
return "", fmt.Errorf("kernel version does not support eBPF")
|
||||
}
|
||||
|
||||
tm.tracesLock.Lock()
|
||||
defer tm.tracesLock.Unlock()
|
||||
|
||||
// Generate trace ID
|
||||
tm.traceCounter++
|
||||
traceID := fmt.Sprintf("trace_%d", tm.traceCounter)
|
||||
|
||||
// Generate bpftrace script
|
||||
script, err := tm.generateBpftraceScript(spec)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to generate bpftrace script: %w", err)
|
||||
}
|
||||
|
||||
// Debug: log the generated script
|
||||
logging.Debug("Generated bpftrace script for %s:\n%s", spec.Target, script)
|
||||
|
||||
// Create context with timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(spec.Duration)*time.Second)
|
||||
|
||||
// Start bpftrace process
|
||||
cmd := exec.CommandContext(ctx, "bpftrace", "-e", script)
|
||||
|
||||
// Create stdout pipe BEFORE starting
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
cancel()
|
||||
return "", fmt.Errorf("failed to create stdout pipe: %w", err)
|
||||
}
|
||||
|
||||
trace := &RunningTrace{
|
||||
ID: traceID,
|
||||
Spec: spec,
|
||||
Process: cmd,
|
||||
Events: []TraceEvent{},
|
||||
StartTime: time.Now(),
|
||||
Cancel: cancel,
|
||||
Context: ctx,
|
||||
Done: make(chan struct{}), // Initialize completion signal
|
||||
}
|
||||
|
||||
// Start the trace
|
||||
if err := cmd.Start(); err != nil {
|
||||
cancel()
|
||||
return "", fmt.Errorf("failed to start bpftrace: %w", err)
|
||||
}
|
||||
|
||||
tm.traces[traceID] = trace
|
||||
|
||||
// Monitor the trace in a goroutine
|
||||
go tm.monitorTrace(traceID, stdout)
|
||||
|
||||
logging.Debug("Started BCC-style trace %s for target %s", traceID, spec.Target)
|
||||
return traceID, nil
|
||||
} // generateBpftraceScript generates a bpftrace script based on the trace specification
|
||||
func (tm *BCCTraceManager) generateBpftraceScript(spec TraceSpec) (string, error) {
|
||||
var script strings.Builder
|
||||
|
||||
// Build probe specification
|
||||
var probe string
|
||||
switch spec.ProbeType {
|
||||
case "p", "": // kprobe (default)
|
||||
if strings.HasPrefix(spec.Target, "sys_") || strings.HasPrefix(spec.Target, "__x64_sys_") {
|
||||
probe = fmt.Sprintf("kprobe:%s", spec.Target)
|
||||
} else {
|
||||
probe = fmt.Sprintf("kprobe:%s", spec.Target)
|
||||
}
|
||||
case "r": // kretprobe
|
||||
if strings.HasPrefix(spec.Target, "sys_") || strings.HasPrefix(spec.Target, "__x64_sys_") {
|
||||
probe = fmt.Sprintf("kretprobe:%s", spec.Target)
|
||||
} else {
|
||||
probe = fmt.Sprintf("kretprobe:%s", spec.Target)
|
||||
}
|
||||
case "t": // tracepoint
|
||||
// If target already includes tracepoint prefix, use as-is
|
||||
if strings.HasPrefix(spec.Target, "tracepoint:") {
|
||||
probe = spec.Target
|
||||
} else {
|
||||
probe = fmt.Sprintf("tracepoint:%s", spec.Target)
|
||||
}
|
||||
case "u": // uprobe
|
||||
if spec.Library == "" {
|
||||
return "", fmt.Errorf("library required for uprobe")
|
||||
}
|
||||
probe = fmt.Sprintf("uprobe:%s:%s", spec.Library, spec.Target)
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported probe type: %s", spec.ProbeType)
|
||||
}
|
||||
|
||||
// Add BEGIN block
|
||||
script.WriteString("BEGIN {\n")
|
||||
script.WriteString(fmt.Sprintf(" printf(\"Starting trace for %s...\\n\");\n", spec.Target))
|
||||
script.WriteString("}\n\n")
|
||||
|
||||
// Build the main probe
|
||||
script.WriteString(fmt.Sprintf("%s {\n", probe))
|
||||
|
||||
// Add filters if specified
|
||||
if tm.needsFiltering(spec) {
|
||||
script.WriteString(" if (")
|
||||
filters := tm.buildFilters(spec)
|
||||
script.WriteString(strings.Join(filters, " && "))
|
||||
script.WriteString(") {\n")
|
||||
}
|
||||
|
||||
// Build output format
|
||||
outputFormat := tm.buildOutputFormat(spec)
|
||||
script.WriteString(fmt.Sprintf(" printf(\"%s\\n\"", outputFormat))
|
||||
|
||||
// Add arguments
|
||||
args := tm.buildArgumentList(spec)
|
||||
if len(args) > 0 {
|
||||
script.WriteString(", ")
|
||||
script.WriteString(strings.Join(args, ", "))
|
||||
}
|
||||
|
||||
script.WriteString(");\n")
|
||||
|
||||
// Close filter if block
|
||||
if tm.needsFiltering(spec) {
|
||||
script.WriteString(" }\n")
|
||||
}
|
||||
|
||||
script.WriteString("}\n\n")
|
||||
|
||||
// Add END block
|
||||
script.WriteString("END {\n")
|
||||
script.WriteString(fmt.Sprintf(" printf(\"Trace completed for %s\\n\");\n", spec.Target))
|
||||
script.WriteString("}\n")
|
||||
|
||||
return script.String(), nil
|
||||
}
|
||||
|
||||
// needsFiltering checks if any filters are needed
|
||||
func (tm *BCCTraceManager) needsFiltering(spec TraceSpec) bool {
|
||||
return spec.PID != 0 || spec.TID != 0 || spec.UID != -1 ||
|
||||
spec.ProcessName != "" || spec.Filter != ""
|
||||
}
|
||||
|
||||
// buildFilters builds the filter conditions
|
||||
func (tm *BCCTraceManager) buildFilters(spec TraceSpec) []string {
|
||||
var filters []string
|
||||
|
||||
if spec.PID != 0 {
|
||||
filters = append(filters, fmt.Sprintf("pid == %d", spec.PID))
|
||||
}
|
||||
|
||||
if spec.TID != 0 {
|
||||
filters = append(filters, fmt.Sprintf("tid == %d", spec.TID))
|
||||
}
|
||||
|
||||
if spec.UID != -1 {
|
||||
filters = append(filters, fmt.Sprintf("uid == %d", spec.UID))
|
||||
}
|
||||
|
||||
if spec.ProcessName != "" {
|
||||
filters = append(filters, fmt.Sprintf("strncmp(comm, \"%s\", %d) == 0", spec.ProcessName, len(spec.ProcessName)))
|
||||
}
|
||||
|
||||
// Add custom filter
|
||||
if spec.Filter != "" {
|
||||
// Convert common patterns to bpftrace syntax
|
||||
customFilter := strings.ReplaceAll(spec.Filter, "arg", "arg")
|
||||
filters = append(filters, customFilter)
|
||||
}
|
||||
|
||||
return filters
|
||||
}
|
||||
|
||||
// buildOutputFormat creates the output format string
|
||||
func (tm *BCCTraceManager) buildOutputFormat(spec TraceSpec) string {
|
||||
if spec.Format != "" {
|
||||
// Use custom format
|
||||
return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|%s", spec.Target, spec.Format)
|
||||
}
|
||||
|
||||
// Default format
|
||||
return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|called", spec.Target)
|
||||
}
|
||||
|
||||
// buildArgumentList creates the argument list for printf
|
||||
func (tm *BCCTraceManager) buildArgumentList(spec TraceSpec) []string {
|
||||
// Always include timestamp, pid, tid, comm
|
||||
args := []string{"nsecs", "pid", "tid", "comm"}
|
||||
|
||||
// Add custom arguments
|
||||
for _, arg := range spec.Arguments {
|
||||
switch arg {
|
||||
case "arg1", "arg2", "arg3", "arg4", "arg5", "arg6":
|
||||
args = append(args, fmt.Sprintf("arg%s", strings.TrimPrefix(arg, "arg")))
|
||||
case "retval":
|
||||
args = append(args, "retval")
|
||||
case "cpu":
|
||||
args = append(args, "cpu")
|
||||
default:
|
||||
// Custom expression
|
||||
args = append(args, arg)
|
||||
}
|
||||
}
|
||||
|
||||
return args
|
||||
}
|
||||
|
||||
// monitorTrace monitors a running trace and collects events
|
||||
func (tm *BCCTraceManager) monitorTrace(traceID string, stdout io.ReadCloser) {
|
||||
tm.tracesLock.Lock()
|
||||
trace, exists := tm.traces[traceID]
|
||||
if !exists {
|
||||
tm.tracesLock.Unlock()
|
||||
return
|
||||
}
|
||||
tm.tracesLock.Unlock()
|
||||
|
||||
// Start reading output in a goroutine
|
||||
go func() {
|
||||
scanner := NewEventScanner(stdout)
|
||||
for scanner.Scan() {
|
||||
event := scanner.Event()
|
||||
if event != nil {
|
||||
tm.tracesLock.Lock()
|
||||
if t, exists := tm.traces[traceID]; exists {
|
||||
t.Events = append(t.Events, *event)
|
||||
}
|
||||
tm.tracesLock.Unlock()
|
||||
}
|
||||
}
|
||||
stdout.Close()
|
||||
}()
|
||||
|
||||
// Wait for the process to complete
|
||||
err := trace.Process.Wait()
|
||||
|
||||
// Clean up
|
||||
trace.Cancel()
|
||||
|
||||
tm.tracesLock.Lock()
|
||||
if err != nil && err.Error() != "signal: killed" {
|
||||
logging.Warning("Trace %s completed with error: %v", traceID, err)
|
||||
} else {
|
||||
logging.Debug("Trace %s completed successfully with %d events",
|
||||
traceID, len(trace.Events))
|
||||
}
|
||||
|
||||
// Signal that monitoring is complete
|
||||
close(trace.Done)
|
||||
tm.tracesLock.Unlock()
|
||||
}
|
||||
|
||||
// GetTraceResult returns the results of a completed trace
|
||||
func (tm *BCCTraceManager) GetTraceResult(traceID string) (*TraceResult, error) {
|
||||
tm.tracesLock.RLock()
|
||||
trace, exists := tm.traces[traceID]
|
||||
if !exists {
|
||||
tm.tracesLock.RUnlock()
|
||||
return nil, fmt.Errorf("trace %s not found", traceID)
|
||||
}
|
||||
tm.tracesLock.RUnlock()
|
||||
|
||||
// Wait for trace monitoring to complete
|
||||
select {
|
||||
case <-trace.Done:
|
||||
// Trace monitoring completed
|
||||
case <-time.After(5 * time.Second):
|
||||
// Timeout waiting for completion
|
||||
return nil, fmt.Errorf("timeout waiting for trace %s to complete", traceID)
|
||||
}
|
||||
|
||||
// Now safely read the final results
|
||||
tm.tracesLock.RLock()
|
||||
defer tm.tracesLock.RUnlock()
|
||||
|
||||
result := &TraceResult{
|
||||
TraceID: traceID,
|
||||
Spec: trace.Spec,
|
||||
Events: make([]TraceEvent, len(trace.Events)),
|
||||
EventCount: len(trace.Events),
|
||||
StartTime: trace.StartTime,
|
||||
EndTime: time.Now(),
|
||||
}
|
||||
|
||||
copy(result.Events, trace.Events)
|
||||
|
||||
// Calculate statistics
|
||||
result.Statistics = tm.calculateStatistics(result.Events, result.EndTime.Sub(result.StartTime))
|
||||
|
||||
// Generate summary
|
||||
result.Summary = tm.generateSummary(result)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// calculateStatistics calculates statistics for the trace results
|
||||
func (tm *BCCTraceManager) calculateStatistics(events []TraceEvent, duration time.Duration) TraceStats {
|
||||
stats := TraceStats{
|
||||
TotalEvents: len(events),
|
||||
EventsByProcess: make(map[string]int),
|
||||
EventsByUID: make(map[int]int),
|
||||
}
|
||||
|
||||
if duration > 0 {
|
||||
stats.EventsPerSecond = float64(len(events)) / duration.Seconds()
|
||||
}
|
||||
|
||||
// Calculate per-process and per-UID statistics
|
||||
for _, event := range events {
|
||||
stats.EventsByProcess[event.ProcessName]++
|
||||
stats.EventsByUID[event.UID]++
|
||||
}
|
||||
|
||||
// Calculate top processes
|
||||
for processName, count := range stats.EventsByProcess {
|
||||
percentage := float64(count) / float64(len(events)) * 100
|
||||
stats.TopProcesses = append(stats.TopProcesses, ProcessStat{
|
||||
ProcessName: processName,
|
||||
EventCount: count,
|
||||
Percentage: percentage,
|
||||
})
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// generateSummary generates a human-readable summary
|
||||
func (tm *BCCTraceManager) generateSummary(result *TraceResult) string {
|
||||
duration := result.EndTime.Sub(result.StartTime)
|
||||
|
||||
summary := fmt.Sprintf("Traced %s for %v, captured %d events (%.2f events/sec)",
|
||||
result.Spec.Target, duration, result.EventCount, result.Statistics.EventsPerSecond)
|
||||
|
||||
if len(result.Statistics.TopProcesses) > 0 {
|
||||
summary += fmt.Sprintf(", top process: %s (%d events)",
|
||||
result.Statistics.TopProcesses[0].ProcessName,
|
||||
result.Statistics.TopProcesses[0].EventCount)
|
||||
}
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
// StopTrace stops an active trace
|
||||
func (tm *BCCTraceManager) StopTrace(traceID string) error {
|
||||
tm.tracesLock.Lock()
|
||||
defer tm.tracesLock.Unlock()
|
||||
|
||||
trace, exists := tm.traces[traceID]
|
||||
if !exists {
|
||||
return fmt.Errorf("trace %s not found", traceID)
|
||||
}
|
||||
|
||||
if trace.Process.ProcessState == nil {
|
||||
// Process is still running, kill it
|
||||
if err := trace.Process.Process.Kill(); err != nil {
|
||||
return fmt.Errorf("failed to stop trace: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
trace.Cancel()
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListActiveTraces returns a list of active trace IDs
|
||||
func (tm *BCCTraceManager) ListActiveTraces() []string {
|
||||
tm.tracesLock.RLock()
|
||||
defer tm.tracesLock.RUnlock()
|
||||
|
||||
var active []string
|
||||
for id, trace := range tm.traces {
|
||||
if trace.Process.ProcessState == nil {
|
||||
active = append(active, id)
|
||||
}
|
||||
}
|
||||
|
||||
return active
|
||||
}
|
||||
|
||||
// GetSummary returns a summary of the trace manager state
|
||||
func (tm *BCCTraceManager) GetSummary() map[string]interface{} {
|
||||
tm.tracesLock.RLock()
|
||||
defer tm.tracesLock.RUnlock()
|
||||
|
||||
activeCount := 0
|
||||
completedCount := 0
|
||||
|
||||
for _, trace := range tm.traces {
|
||||
if trace.Process.ProcessState == nil {
|
||||
activeCount++
|
||||
} else {
|
||||
completedCount++
|
||||
}
|
||||
}
|
||||
|
||||
return map[string]interface{}{
|
||||
"capabilities": tm.capabilities,
|
||||
"active_traces": activeCount,
|
||||
"completed_traces": completedCount,
|
||||
"total_traces": len(tm.traces),
|
||||
"active_trace_ids": tm.ListActiveTraces(),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user