588 lines
16 KiB
Go
588 lines
16 KiB
Go
package main
|
|
|
|
import (
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"sort"
	"strings"
	"sync"
	"time"

	"nannyagentv2/internal/logging"
)
|
|
|
|
// TraceSpec describes one trace request, modelled on the probe
// specifications accepted by BCC's trace.py.
type TraceSpec struct {
	// ProbeType selects the probe kind: "p" (kprobe), "r" (kretprobe),
	// "t" (tracepoint) or "u" (uprobe).
	ProbeType string `json:"probe_type"`

	// Target is the function, syscall or tracepoint to attach to.
	Target string `json:"target"`

	// Library is the library used for userspace probes; empty for kernel probes.
	Library string `json:"library,omitempty"`

	// Format is the output format string (e.g. "read %d bytes", arg3).
	Format string `json:"format"`

	// Arguments lists the values to extract (e.g. ["arg1", "arg2", "retval"]).
	Arguments []string `json:"arguments"`

	// Filter is an optional condition (e.g. "arg3 > 20000").
	Filter string `json:"filter,omitempty"`

	// Duration is how long to trace, in seconds.
	Duration int `json:"duration"`

	// PID optionally restricts the trace to one process ID.
	PID int `json:"pid,omitempty"`

	// TID optionally restricts the trace to one thread ID.
	TID int `json:"tid,omitempty"`

	// UID optionally restricts the trace to one user ID.
	UID int `json:"uid,omitempty"`

	// ProcessName optionally restricts the trace to one process name.
	ProcessName string `json:"process_name,omitempty"`
}
|
|
|
|
// TraceEvent is a single event captured from the eBPF trace output.
type TraceEvent struct {
	// When and where the event happened.
	Timestamp int64 `json:"timestamp"`
	PID       int   `json:"pid"`
	TID       int   `json:"tid"`
	UID       int   `json:"uid"`

	// Which process and traced function produced it.
	ProcessName string `json:"process_name"`
	Function    string `json:"function"`

	// Event payload: the formatted message plus the raw argument values by name.
	Message string            `json:"message"`
	RawArgs map[string]string `json:"raw_args"`

	// CPU is omitted from JSON when zero.
	CPU int `json:"cpu,omitempty"`
}
|
|
|
|
// TraceResult represents the results of a tracing session
|
|
type TraceResult struct {
|
|
TraceID string `json:"trace_id"`
|
|
Spec TraceSpec `json:"spec"`
|
|
Events []TraceEvent `json:"events"`
|
|
EventCount int `json:"event_count"`
|
|
StartTime time.Time `json:"start_time"`
|
|
EndTime time.Time `json:"end_time"`
|
|
Summary string `json:"summary"`
|
|
Statistics TraceStats `json:"statistics"`
|
|
}
|
|
|
|
// TraceStats provides statistics about the trace
|
|
type TraceStats struct {
|
|
TotalEvents int `json:"total_events"`
|
|
EventsByProcess map[string]int `json:"events_by_process"`
|
|
EventsByUID map[int]int `json:"events_by_uid"`
|
|
EventsPerSecond float64 `json:"events_per_second"`
|
|
TopProcesses []ProcessStat `json:"top_processes"`
|
|
}
|
|
|
|
// ProcessStat is the per-process slice of a trace's statistics.
type ProcessStat struct {
	// ProcessName and PID identify the process.
	ProcessName string `json:"process_name"`
	PID         int    `json:"pid"`

	// EventCount is the number of events attributed to the process and
	// Percentage its share of all events.
	EventCount int     `json:"event_count"`
	Percentage float64 `json:"percentage"`
}
|
|
|
|
// BCCTraceManager implements advanced eBPF tracing similar to BCC trace.py
|
|
type BCCTraceManager struct {
|
|
traces map[string]*RunningTrace
|
|
tracesLock sync.RWMutex
|
|
traceCounter int
|
|
capabilities map[string]bool
|
|
}
|
|
|
|
// RunningTrace represents an active trace session
|
|
type RunningTrace struct {
|
|
ID string
|
|
Spec TraceSpec
|
|
Process *exec.Cmd
|
|
Events []TraceEvent
|
|
StartTime time.Time
|
|
Cancel context.CancelFunc
|
|
Context context.Context
|
|
Done chan struct{} // Signal when trace monitoring is complete
|
|
}
|
|
|
|
// NewBCCTraceManager creates a new BCC-style trace manager
|
|
func NewBCCTraceManager() *BCCTraceManager {
|
|
manager := &BCCTraceManager{
|
|
traces: make(map[string]*RunningTrace),
|
|
capabilities: make(map[string]bool),
|
|
}
|
|
|
|
manager.testCapabilities()
|
|
return manager
|
|
}
|
|
|
|
// testCapabilities checks what tracing capabilities are available
|
|
func (tm *BCCTraceManager) testCapabilities() {
|
|
// Test if bpftrace is available
|
|
if _, err := exec.LookPath("bpftrace"); err == nil {
|
|
tm.capabilities["bpftrace"] = true
|
|
} else {
|
|
tm.capabilities["bpftrace"] = false
|
|
}
|
|
|
|
// Test if perf is available for fallback
|
|
if _, err := exec.LookPath("perf"); err == nil {
|
|
tm.capabilities["perf"] = true
|
|
} else {
|
|
tm.capabilities["perf"] = false
|
|
}
|
|
|
|
// Test root privileges (required for eBPF)
|
|
tm.capabilities["root_access"] = os.Geteuid() == 0
|
|
|
|
// Test kernel version
|
|
cmd := exec.Command("uname", "-r")
|
|
output, err := cmd.Output()
|
|
if err == nil {
|
|
version := strings.TrimSpace(string(output))
|
|
// eBPF requires kernel 4.4+
|
|
tm.capabilities["kernel_ebpf"] = !strings.HasPrefix(version, "3.")
|
|
} else {
|
|
tm.capabilities["kernel_ebpf"] = false
|
|
}
|
|
|
|
// Test if we can access debugfs
|
|
if _, err := os.Stat("/sys/kernel/debug/tracing/available_events"); err == nil {
|
|
tm.capabilities["debugfs_access"] = true
|
|
} else {
|
|
tm.capabilities["debugfs_access"] = false
|
|
}
|
|
|
|
logging.Debug("BCC Trace capabilities: %+v", tm.capabilities)
|
|
}
|
|
|
|
// GetCapabilities returns available tracing capabilities
|
|
func (tm *BCCTraceManager) GetCapabilities() map[string]bool {
|
|
tm.tracesLock.RLock()
|
|
defer tm.tracesLock.RUnlock()
|
|
|
|
caps := make(map[string]bool)
|
|
for k, v := range tm.capabilities {
|
|
caps[k] = v
|
|
}
|
|
return caps
|
|
}
|
|
|
|
// StartTrace starts a new trace session based on the specification
|
|
func (tm *BCCTraceManager) StartTrace(spec TraceSpec) (string, error) {
|
|
if !tm.capabilities["bpftrace"] {
|
|
return "", fmt.Errorf("bpftrace not available - install bpftrace package")
|
|
}
|
|
|
|
if !tm.capabilities["root_access"] {
|
|
return "", fmt.Errorf("root access required for eBPF tracing")
|
|
}
|
|
|
|
if !tm.capabilities["kernel_ebpf"] {
|
|
return "", fmt.Errorf("kernel version does not support eBPF")
|
|
}
|
|
|
|
tm.tracesLock.Lock()
|
|
defer tm.tracesLock.Unlock()
|
|
|
|
// Generate trace ID
|
|
tm.traceCounter++
|
|
traceID := fmt.Sprintf("trace_%d", tm.traceCounter)
|
|
|
|
// Generate bpftrace script
|
|
script, err := tm.generateBpftraceScript(spec)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to generate bpftrace script: %w", err)
|
|
}
|
|
|
|
// Debug: log the generated script
|
|
logging.Debug("Generated bpftrace script for %s:\n%s", spec.Target, script)
|
|
|
|
// Create context with timeout
|
|
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(spec.Duration)*time.Second)
|
|
|
|
// Start bpftrace process
|
|
cmd := exec.CommandContext(ctx, "bpftrace", "-e", script)
|
|
|
|
// Create stdout pipe BEFORE starting
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
cancel()
|
|
return "", fmt.Errorf("failed to create stdout pipe: %w", err)
|
|
}
|
|
|
|
trace := &RunningTrace{
|
|
ID: traceID,
|
|
Spec: spec,
|
|
Process: cmd,
|
|
Events: []TraceEvent{},
|
|
StartTime: time.Now(),
|
|
Cancel: cancel,
|
|
Context: ctx,
|
|
Done: make(chan struct{}), // Initialize completion signal
|
|
}
|
|
|
|
// Start the trace
|
|
if err := cmd.Start(); err != nil {
|
|
cancel()
|
|
return "", fmt.Errorf("failed to start bpftrace: %w", err)
|
|
}
|
|
|
|
tm.traces[traceID] = trace
|
|
|
|
// Monitor the trace in a goroutine
|
|
go tm.monitorTrace(traceID, stdout)
|
|
|
|
logging.Debug("Started BCC-style trace %s for target %s", traceID, spec.Target)
|
|
return traceID, nil
|
|
} // generateBpftraceScript generates a bpftrace script based on the trace specification
|
|
func (tm *BCCTraceManager) generateBpftraceScript(spec TraceSpec) (string, error) {
|
|
var script strings.Builder
|
|
|
|
// Build probe specification
|
|
var probe string
|
|
switch spec.ProbeType {
|
|
case "p", "": // kprobe (default)
|
|
if strings.HasPrefix(spec.Target, "sys_") || strings.HasPrefix(spec.Target, "__x64_sys_") {
|
|
probe = fmt.Sprintf("kprobe:%s", spec.Target)
|
|
} else {
|
|
probe = fmt.Sprintf("kprobe:%s", spec.Target)
|
|
}
|
|
case "r": // kretprobe
|
|
if strings.HasPrefix(spec.Target, "sys_") || strings.HasPrefix(spec.Target, "__x64_sys_") {
|
|
probe = fmt.Sprintf("kretprobe:%s", spec.Target)
|
|
} else {
|
|
probe = fmt.Sprintf("kretprobe:%s", spec.Target)
|
|
}
|
|
case "t": // tracepoint
|
|
// If target already includes tracepoint prefix, use as-is
|
|
if strings.HasPrefix(spec.Target, "tracepoint:") {
|
|
probe = spec.Target
|
|
} else {
|
|
probe = fmt.Sprintf("tracepoint:%s", spec.Target)
|
|
}
|
|
case "u": // uprobe
|
|
if spec.Library == "" {
|
|
return "", fmt.Errorf("library required for uprobe")
|
|
}
|
|
probe = fmt.Sprintf("uprobe:%s:%s", spec.Library, spec.Target)
|
|
default:
|
|
return "", fmt.Errorf("unsupported probe type: %s", spec.ProbeType)
|
|
}
|
|
|
|
// Add BEGIN block
|
|
script.WriteString("BEGIN {\n")
|
|
script.WriteString(fmt.Sprintf(" printf(\"Starting trace for %s...\\n\");\n", spec.Target))
|
|
script.WriteString("}\n\n")
|
|
|
|
// Build the main probe
|
|
script.WriteString(fmt.Sprintf("%s {\n", probe))
|
|
|
|
// Add filters if specified
|
|
if tm.needsFiltering(spec) {
|
|
script.WriteString(" if (")
|
|
filters := tm.buildFilters(spec)
|
|
script.WriteString(strings.Join(filters, " && "))
|
|
script.WriteString(") {\n")
|
|
}
|
|
|
|
// Build output format
|
|
outputFormat := tm.buildOutputFormat(spec)
|
|
script.WriteString(fmt.Sprintf(" printf(\"%s\\n\"", outputFormat))
|
|
|
|
// Add arguments
|
|
args := tm.buildArgumentList(spec)
|
|
if len(args) > 0 {
|
|
script.WriteString(", ")
|
|
script.WriteString(strings.Join(args, ", "))
|
|
}
|
|
|
|
script.WriteString(");\n")
|
|
|
|
// Close filter if block
|
|
if tm.needsFiltering(spec) {
|
|
script.WriteString(" }\n")
|
|
}
|
|
|
|
script.WriteString("}\n\n")
|
|
|
|
// Add END block
|
|
script.WriteString("END {\n")
|
|
script.WriteString(fmt.Sprintf(" printf(\"Trace completed for %s\\n\");\n", spec.Target))
|
|
script.WriteString("}\n")
|
|
|
|
return script.String(), nil
|
|
}
|
|
|
|
// needsFiltering checks if any filters are needed
|
|
func (tm *BCCTraceManager) needsFiltering(spec TraceSpec) bool {
|
|
return spec.PID != 0 || spec.TID != 0 || spec.UID != -1 ||
|
|
spec.ProcessName != "" || spec.Filter != ""
|
|
}
|
|
|
|
// buildFilters builds the filter conditions
|
|
func (tm *BCCTraceManager) buildFilters(spec TraceSpec) []string {
|
|
var filters []string
|
|
|
|
if spec.PID != 0 {
|
|
filters = append(filters, fmt.Sprintf("pid == %d", spec.PID))
|
|
}
|
|
|
|
if spec.TID != 0 {
|
|
filters = append(filters, fmt.Sprintf("tid == %d", spec.TID))
|
|
}
|
|
|
|
if spec.UID != -1 {
|
|
filters = append(filters, fmt.Sprintf("uid == %d", spec.UID))
|
|
}
|
|
|
|
if spec.ProcessName != "" {
|
|
filters = append(filters, fmt.Sprintf("strncmp(comm, \"%s\", %d) == 0", spec.ProcessName, len(spec.ProcessName)))
|
|
}
|
|
|
|
// Add custom filter
|
|
if spec.Filter != "" {
|
|
// Convert common patterns to bpftrace syntax
|
|
customFilter := strings.ReplaceAll(spec.Filter, "arg", "arg")
|
|
filters = append(filters, customFilter)
|
|
}
|
|
|
|
return filters
|
|
}
|
|
|
|
// buildOutputFormat creates the output format string
|
|
func (tm *BCCTraceManager) buildOutputFormat(spec TraceSpec) string {
|
|
if spec.Format != "" {
|
|
// Use custom format
|
|
return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|%s", spec.Target, spec.Format)
|
|
}
|
|
|
|
// Default format
|
|
return fmt.Sprintf("TRACE|%%d|%%d|%%d|%%s|%s|called", spec.Target)
|
|
}
|
|
|
|
// buildArgumentList creates the argument list for printf
|
|
func (tm *BCCTraceManager) buildArgumentList(spec TraceSpec) []string {
|
|
// Always include timestamp, pid, tid, comm
|
|
args := []string{"nsecs", "pid", "tid", "comm"}
|
|
|
|
// Add custom arguments
|
|
for _, arg := range spec.Arguments {
|
|
switch arg {
|
|
case "arg1", "arg2", "arg3", "arg4", "arg5", "arg6":
|
|
args = append(args, fmt.Sprintf("arg%s", strings.TrimPrefix(arg, "arg")))
|
|
case "retval":
|
|
args = append(args, "retval")
|
|
case "cpu":
|
|
args = append(args, "cpu")
|
|
default:
|
|
// Custom expression
|
|
args = append(args, arg)
|
|
}
|
|
}
|
|
|
|
return args
|
|
}
|
|
|
|
// monitorTrace monitors a running trace and collects events
|
|
func (tm *BCCTraceManager) monitorTrace(traceID string, stdout io.ReadCloser) {
|
|
tm.tracesLock.Lock()
|
|
trace, exists := tm.traces[traceID]
|
|
if !exists {
|
|
tm.tracesLock.Unlock()
|
|
return
|
|
}
|
|
tm.tracesLock.Unlock()
|
|
|
|
// Start reading output in a goroutine
|
|
go func() {
|
|
scanner := NewEventScanner(stdout)
|
|
for scanner.Scan() {
|
|
event := scanner.Event()
|
|
if event != nil {
|
|
tm.tracesLock.Lock()
|
|
if t, exists := tm.traces[traceID]; exists {
|
|
t.Events = append(t.Events, *event)
|
|
}
|
|
tm.tracesLock.Unlock()
|
|
}
|
|
}
|
|
stdout.Close()
|
|
}()
|
|
|
|
// Wait for the process to complete
|
|
err := trace.Process.Wait()
|
|
|
|
// Clean up
|
|
trace.Cancel()
|
|
|
|
tm.tracesLock.Lock()
|
|
if err != nil && err.Error() != "signal: killed" {
|
|
logging.Warning("Trace %s completed with error: %v", traceID, err)
|
|
} else {
|
|
logging.Debug("Trace %s completed successfully with %d events",
|
|
traceID, len(trace.Events))
|
|
}
|
|
|
|
// Signal that monitoring is complete
|
|
close(trace.Done)
|
|
tm.tracesLock.Unlock()
|
|
}
|
|
|
|
// GetTraceResult returns the results of a completed trace
|
|
func (tm *BCCTraceManager) GetTraceResult(traceID string) (*TraceResult, error) {
|
|
tm.tracesLock.RLock()
|
|
trace, exists := tm.traces[traceID]
|
|
if !exists {
|
|
tm.tracesLock.RUnlock()
|
|
return nil, fmt.Errorf("trace %s not found", traceID)
|
|
}
|
|
tm.tracesLock.RUnlock()
|
|
|
|
// Wait for trace monitoring to complete
|
|
select {
|
|
case <-trace.Done:
|
|
// Trace monitoring completed
|
|
case <-time.After(5 * time.Second):
|
|
// Timeout waiting for completion
|
|
return nil, fmt.Errorf("timeout waiting for trace %s to complete", traceID)
|
|
}
|
|
|
|
// Now safely read the final results
|
|
tm.tracesLock.RLock()
|
|
defer tm.tracesLock.RUnlock()
|
|
|
|
result := &TraceResult{
|
|
TraceID: traceID,
|
|
Spec: trace.Spec,
|
|
Events: make([]TraceEvent, len(trace.Events)),
|
|
EventCount: len(trace.Events),
|
|
StartTime: trace.StartTime,
|
|
EndTime: time.Now(),
|
|
}
|
|
|
|
copy(result.Events, trace.Events)
|
|
|
|
// Calculate statistics
|
|
result.Statistics = tm.calculateStatistics(result.Events, result.EndTime.Sub(result.StartTime))
|
|
|
|
// Generate summary
|
|
result.Summary = tm.generateSummary(result)
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// calculateStatistics calculates statistics for the trace results
|
|
func (tm *BCCTraceManager) calculateStatistics(events []TraceEvent, duration time.Duration) TraceStats {
|
|
stats := TraceStats{
|
|
TotalEvents: len(events),
|
|
EventsByProcess: make(map[string]int),
|
|
EventsByUID: make(map[int]int),
|
|
}
|
|
|
|
if duration > 0 {
|
|
stats.EventsPerSecond = float64(len(events)) / duration.Seconds()
|
|
}
|
|
|
|
// Calculate per-process and per-UID statistics
|
|
for _, event := range events {
|
|
stats.EventsByProcess[event.ProcessName]++
|
|
stats.EventsByUID[event.UID]++
|
|
}
|
|
|
|
// Calculate top processes
|
|
for processName, count := range stats.EventsByProcess {
|
|
percentage := float64(count) / float64(len(events)) * 100
|
|
stats.TopProcesses = append(stats.TopProcesses, ProcessStat{
|
|
ProcessName: processName,
|
|
EventCount: count,
|
|
Percentage: percentage,
|
|
})
|
|
}
|
|
|
|
return stats
|
|
}
|
|
|
|
// generateSummary generates a human-readable summary
|
|
func (tm *BCCTraceManager) generateSummary(result *TraceResult) string {
|
|
duration := result.EndTime.Sub(result.StartTime)
|
|
|
|
summary := fmt.Sprintf("Traced %s for %v, captured %d events (%.2f events/sec)",
|
|
result.Spec.Target, duration, result.EventCount, result.Statistics.EventsPerSecond)
|
|
|
|
if len(result.Statistics.TopProcesses) > 0 {
|
|
summary += fmt.Sprintf(", top process: %s (%d events)",
|
|
result.Statistics.TopProcesses[0].ProcessName,
|
|
result.Statistics.TopProcesses[0].EventCount)
|
|
}
|
|
|
|
return summary
|
|
}
|
|
|
|
// StopTrace stops an active trace
|
|
func (tm *BCCTraceManager) StopTrace(traceID string) error {
|
|
tm.tracesLock.Lock()
|
|
defer tm.tracesLock.Unlock()
|
|
|
|
trace, exists := tm.traces[traceID]
|
|
if !exists {
|
|
return fmt.Errorf("trace %s not found", traceID)
|
|
}
|
|
|
|
if trace.Process.ProcessState == nil {
|
|
// Process is still running, kill it
|
|
if err := trace.Process.Process.Kill(); err != nil {
|
|
return fmt.Errorf("failed to stop trace: %w", err)
|
|
}
|
|
}
|
|
|
|
trace.Cancel()
|
|
return nil
|
|
}
|
|
|
|
// ListActiveTraces returns a list of active trace IDs
|
|
func (tm *BCCTraceManager) ListActiveTraces() []string {
|
|
tm.tracesLock.RLock()
|
|
defer tm.tracesLock.RUnlock()
|
|
|
|
var active []string
|
|
for id, trace := range tm.traces {
|
|
if trace.Process.ProcessState == nil {
|
|
active = append(active, id)
|
|
}
|
|
}
|
|
|
|
return active
|
|
}
|
|
|
|
// GetSummary returns a summary of the trace manager state
|
|
func (tm *BCCTraceManager) GetSummary() map[string]interface{} {
|
|
tm.tracesLock.RLock()
|
|
defer tm.tracesLock.RUnlock()
|
|
|
|
activeCount := 0
|
|
completedCount := 0
|
|
|
|
for _, trace := range tm.traces {
|
|
if trace.Process.ProcessState == nil {
|
|
activeCount++
|
|
} else {
|
|
completedCount++
|
|
}
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"capabilities": tm.capabilities,
|
|
"active_traces": activeCount,
|
|
"completed_traces": completedCount,
|
|
"total_traces": len(tm.traces),
|
|
"active_trace_ids": tm.ListActiveTraces(),
|
|
}
|
|
}
|