diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore
index 14e91ec45ad0..0fb49166bf14 100644
--- a/libbpf-tools/.gitignore
+++ b/libbpf-tools/.gitignore
@@ -62,6 +62,7 @@
 /tcplife
 /tcppktlat
 /tcptracer
+/tcpretrans
 /tcprtt
 /tcpstates
 /tcpsynbl
diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile
index a442da985a6e..39b0f393038b 100644
--- a/libbpf-tools/Makefile
+++ b/libbpf-tools/Makefile
@@ -88,6 +88,7 @@ APPS = \
 	tcpconnlat \
 	tcplife \
 	tcppktlat \
+	tcpretrans \
 	tcprtt \
 	tcpstates \
 	tcpsynbl \
diff --git a/libbpf-tools/tcpretrans.bpf.c b/libbpf-tools/tcpretrans.bpf.c
new file mode 100644
index 000000000000..d0e82840cf0b
--- /dev/null
+++ b/libbpf-tools/tcpretrans.bpf.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * tcpretrans Trace IPv4 and IPv6 tcp retransmit events
+ *
+ * Copyright (c) 2020 Anton Protopopov
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Based on tcpconnect.c by Anton Protopopov and
+ * tcpretrans(8) from BCC by Brendan Gregg
+ * 15-Jul-2021 Michael Gugino Created this.
+ */
+#include <vmlinux.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+#include "maps.bpf.h"
+#include "tcpretrans.h"
+
+/* Define here, because there are conflicts with include files */
+#define AF_INET 2
+#define AF_INET6 10
+
+/* Set by user space before load: count per flow instead of emitting events. */
+const volatile bool do_count = false;
+
+/* Perf event array used to pass struct event records to user space. */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u32));
+} events SEC(".maps");
+
+/* Retransmit counters keyed by IPv4 flow; written only in counting mode. */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, struct ipv4_flow_key);
+	__type(value, u64);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} ipv4_count SEC(".maps");
+
+/* Retransmit counters keyed by IPv6 flow; written only in counting mode. */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, struct ipv6_flow_key);
+	__type(value, u64);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} ipv6_count SEC(".maps");
+
+/*
+ * Add one to the counter of the IPv4 flow that @skp belongs to.
+ * The key is zero-initialised so the map never sees uninitialised bytes.
+ */
+static void count_v4(const struct sock *skp)
+{
+	struct ipv4_flow_key key = {};
+	static const __u64 zero;
+	__u64 *val;
+
+	BPF_CORE_READ_INTO(&key.saddr, skp, __sk_common.skc_rcv_saddr);
+	BPF_CORE_READ_INTO(&key.daddr, skp, __sk_common.skc_daddr);
+	BPF_CORE_READ_INTO(&key.dport, skp, __sk_common.skc_dport);
+	BPF_CORE_READ_INTO(&key.sport, skp, __sk_common.skc_num);
+
+	/* NOTE(review): helper from maps.bpf.h; presumably inserts zero for a new key */
+	val = bpf_map_lookup_or_try_init(&ipv4_count, &key, &zero);
+	if (val)
+		__atomic_add_fetch(val, 1, __ATOMIC_RELAXED);
+}
+
+/* Add one to the counter of the IPv6 flow that @skp belongs to. */
+static void count_v6(const struct sock *skp)
+{
+	struct ipv6_flow_key key = {};
+	static const __u64 zero;
+	__u64 *val;
+
+	BPF_CORE_READ_INTO(&key.saddr, skp,
+			   __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+	BPF_CORE_READ_INTO(&key.daddr, skp,
+			   __sk_common.skc_v6_daddr.in6_u.u6_addr32);
+	BPF_CORE_READ_INTO(&key.dport, skp, __sk_common.skc_dport);
+	BPF_CORE_READ_INTO(&key.sport, skp, __sk_common.skc_num);
+
+	val = bpf_map_lookup_or_try_init(&ipv6_count, &key, &zero);
+	if (val)
+		__atomic_add_fetch(val, 1, __ATOMIC_RELAXED);
+}
+
+/*
+ * Common handler for every attach point.
+ *
+ * In counting mode it bumps the per-flow counter for the socket's address
+ * family and returns.  Otherwise it fills a struct event (type, pid, ports,
+ * TCP state, addresses) and submits it through the events perf array.
+ * A NULL @skp is ignored.  Always returns 0.
+ */
+static int trace_event(void *ctx, const struct sock *skp, int type)
+{
+	struct event e = {};
+	__u32 family;
+
+	if (skp == NULL)
+		return 0;
+
+	family = BPF_CORE_READ(skp, __sk_common.skc_family);
+
+	if (do_count) {
+		/* only count families that have a matching flow-key layout */
+		if (family == AF_INET)
+			count_v4(skp);
+		else if (family == AF_INET6)
+			count_v6(skp);
+		return 0;
+	}
+
+	e.af = family;
+	e.type = type;
+	/* upper 32 bits of the helper's return value are the tgid */
+	e.pid = bpf_get_current_pid_tgid() >> 32;
+
+	BPF_CORE_READ_INTO(&e.dport, skp, __sk_common.skc_dport);
+	BPF_CORE_READ_INTO(&e.sport, skp, __sk_common.skc_num);
+	e.state = BPF_CORE_READ(skp, __sk_common.skc_state);
+
+	if (family == AF_INET) {
+		/*
+		 * e.saddr/e.daddr are 16 bytes wide and BPF_CORE_READ_INTO copies
+		 * sizeof(*dst) bytes, so fetch the 4-byte IPv4 fields by value.
+		 */
+		__be32 saddr = BPF_CORE_READ(skp, __sk_common.skc_rcv_saddr);
+		__be32 daddr = BPF_CORE_READ(skp, __sk_common.skc_daddr);
+
+		__builtin_memcpy(e.saddr, &saddr, sizeof(saddr));
+		__builtin_memcpy(e.daddr, &daddr, sizeof(daddr));
+	} else if (family == AF_INET6) {
+		BPF_CORE_READ_INTO(&e.saddr, skp,
+				   __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+		BPF_CORE_READ_INTO(&e.daddr, skp,
+				   __sk_common.skc_v6_daddr.in6_u.u6_addr32);
+	}
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
+	return 0;
+}
+
+/* Tracepoint entry: the socket pointer comes from the event's skaddr field. */
+SEC("tp/tcp/tcp_retransmit_skb")
+int tp_tcp_retransmit_skb(struct trace_event_raw_tcp_event_sk_skb *ctx)
+{
+	const struct sock *skp;
+
+	skp = BPF_CORE_READ(ctx, skaddr);
+	return trace_event(ctx, skp, RETRANSMIT);
+}
+
+/* kprobe entry reporting tail loss probes (type TLP). */
+SEC("kprobe/tcp_send_loss_probe")
+int BPF_KPROBE(tcp_send_loss_probe, struct sock *sk)
+{
+	return trace_event(ctx, sk, TLP);
+}
+
+/* kprobe entry reporting retransmits; alternative to the tracepoint program. */
+SEC("kprobe/tcp_retransmit_skb")
+int BPF_KPROBE(tcp_retransmit_skb, struct sock *sk)
+{
+	return trace_event(ctx, sk, RETRANSMIT);
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/tcpretrans.c b/libbpf-tools/tcpretrans.c
new file mode 100644
index 000000000000..d37f77f5f412
--- /dev/null
+++ b/libbpf-tools/tcpretrans.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * tcpretrans Trace IPv4 and IPv6 tcp retransmit events
+ *
+ * Copyright (c) 2020 Anton Protopopov
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Based on tcpconnect.c by Anton Protopopov and
+ * tcpretrans(8) from BCC by Brendan Gregg
+ * 15-Jul-2021 Michael Gugino Created this.
+ */
+#include <argp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "tcpretrans.h"
+#include "tcpretrans.skel.h"
+#include "trace_helpers.h"
+#include "map_helpers.h"
+
+#define warn(...) fprintf(stderr, __VA_ARGS__)
+
+const char *argp_program_version = "tcpretrans 0.1";
+const char *argp_program_bug_address =
+	"https://github.com/iovisor/bcc/tree/master/libbpf-tools";
+static const char argp_program_doc[] =
+	"\ntcpretrans: Trace TCP retransmits\n"
+	"\n"
+	"EXAMPLES:\n"
+	" tcpretrans # display all TCP retransmissions\n"
+	" tcpretrans -c # count occurred retransmits per flow\n"
+	" tcpretrans -l # include tail loss probe attempts\n"
+	;
+
+static const char *tppath = "/sys/kernel/debug/tracing/events/tcp/tcp_retransmit_skb/id";
+
+/* State names indexed by the skc_state value carried in event.state. */
+static const char *TCPSTATE[] = {
+	"",
+	"ESTABLISHED",
+	"SYN_SENT",
+	"SYN_RECV",
+	"FIN_WAIT1",
+	"FIN_WAIT2",
+	"TIME_WAIT",
+	"CLOSE",
+	"CLOSE_WAIT",
+	"LAST_ACK",
+	"LISTEN",
+	"CLOSING",
+	"NEW_SYN_RECV"
+};
+
+static volatile sig_atomic_t exiting = 0;
+
+/* SIGINT/SIGTERM handler: tell the print loops to stop. */
+static void sig_handler(int sig)
+{
+	exiting = true;
+}
+
+static const struct argp_option opts[] = {
+	{ "verbose", 'v', NULL, 0, "Verbose debug output", 0 },
+	{ "count", 'c', NULL, 0, "Count retransmits per flow", 0 },
+	{ "lossprobe", 'l', NULL, 0, "include tail loss probe attempts", 0 },
+	{ "kprobe", 'k', NULL, 0, "force kprobe instead of tracepoint", 0 },
+	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help", 0 },
+	{},
+};
+
+static struct env {
+	bool verbose;
+	bool count;
+	bool lossprobe;
+	bool kprobe;
+} env = {};
+
+/* argp callback: record each recognised flag in env. */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case 'h':
+		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+		break;
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'c':
+		env.count = true;
+		break;
+	case 'l':
+		env.lossprobe = true;
+		break;
+	case 'k':
+		env.kprobe = true;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+/* libbpf log callback: drop debug-level messages unless -v was given. */
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+/* Dump the ipv4_count map and print one row per IPv4 flow. */
+static void print_count_ipv4(int map_fd)
+{
+	static struct ipv4_flow_key keys[MAX_ENTRIES];
+	__u32 value_size = sizeof(__u64);
+	__u32 key_size = sizeof(keys[0]);
+	static struct ipv4_flow_key zero;
+	static __u64 counts[MAX_ENTRIES];
+	char s[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN];
+	__u32 i, n = MAX_ENTRIES;
+
+	if (dump_hash(map_fd, keys, key_size, counts, value_size, &n, &zero, false)) {
+		warn("dump_hash: %s\n", strerror(errno));
+		return;
+	}
+
+	for (i = 0; i < n; i++) {
+		printf("%-25s %-25s %-20d %-10llu\n",
+		       inet_ntop(AF_INET, &keys[i].saddr, s, sizeof(s)),
+		       inet_ntop(AF_INET, &keys[i].daddr, d, sizeof(d)),
+		       ntohs(keys[i].dport), counts[i]);
+	}
+}
+
+/* Dump the ipv6_count map and print one row per IPv6 flow. */
+static void print_count_ipv6(int map_fd)
+{
+	static struct ipv6_flow_key keys[MAX_ENTRIES];
+	__u32 value_size = sizeof(__u64);
+	__u32 key_size = sizeof(keys[0]);
+	static struct ipv6_flow_key zero;
+	static __u64 counts[MAX_ENTRIES];
+	char s[INET6_ADDRSTRLEN], d[INET6_ADDRSTRLEN];
+	__u32 i, n = MAX_ENTRIES;
+
+	if (dump_hash(map_fd, keys, key_size, counts, value_size, &n, &zero, false)) {
+		warn("dump_hash: %s\n", strerror(errno));
+		return;
+	}
+
+	for (i = 0; i < n; i++) {
+		printf("%-25s %-25s %-20d %-10llu\n",
+		       inet_ntop(AF_INET6, keys[i].saddr, s, sizeof(s)),
+		       inet_ntop(AF_INET6, keys[i].daddr, d, sizeof(d)),
+		       ntohs(keys[i].dport), counts[i]);
+	}
+}
+
+/*
+ * Sleep until a termination signal arrives, then print every counted flow.
+ */
+static void print_count(int map_fd_ipv4, int map_fd_ipv6)
+{
+	while (!exiting)
+		pause();
+
+	printf("\n%-25s %-25s %-20s %-10s\n",
+	       "LADDR", "RADDR", "RPORT", "RETRANSMITS");
+	print_count_ipv4(map_fd_ipv4);
+	print_count_ipv6(map_fd_ipv6);
+}
+
+/* Print the column titles for event mode. */
+static void print_events_header(void)
+{
+	printf("%-8s %-6s %-2s %-20s %1s> %-20s %-4s\n", "TIME", "PID", "IP",
+	       "LADDR:LPORT", "T", "RADDR:RPORT", "STATE");
+}
+
+/*
+ * perf buffer callback: print one struct event as a table row.
+ * Events whose address family is neither AF_INET nor AF_INET6 are reported
+ * on stderr and dropped.  A state outside TCPSTATE[] prints as "UNKNOWN"
+ * rather than indexing past the end of the table.
+ */
+static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
+{
+	const struct event *e = data;
+	const char *state = "UNKNOWN";
+	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+	char remote[INET6_ADDRSTRLEN + 6];
+	char local[INET6_ADDRSTRLEN + 6];
+	union {
+		struct in_addr x4;
+		struct in6_addr x6;
+	} s, d;
+	struct tm *tm;
+	char ts[32];
+	time_t t;
+
+	if (e->af == AF_INET) {
+		memcpy(&s.x4.s_addr, e->saddr, sizeof(s.x4.s_addr));
+		memcpy(&d.x4.s_addr, e->daddr, sizeof(d.x4.s_addr));
+	} else if (e->af == AF_INET6) {
+		memcpy(&s.x6.s6_addr, e->saddr, sizeof(s.x6.s6_addr));
+		memcpy(&d.x6.s6_addr, e->daddr, sizeof(d.x6.s6_addr));
+	} else {
+		warn("broken event: event->af=%u\n", e->af);
+		return;
+	}
+
+	if (e->state >= 0 && e->state < (int)(sizeof(TCPSTATE) / sizeof(TCPSTATE[0])))
+		state = TCPSTATE[e->state];
+
+	time(&t);
+	tm = localtime(&t);
+	strftime(ts, sizeof(ts), "%H:%M:%S", tm);
+	snprintf(local, sizeof(local), "%s:%d",
+		 inet_ntop(e->af, &s, src, sizeof(src)), e->sport);
+	snprintf(remote, sizeof(remote), "%s:%d",
+		 inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport));
+
+	printf("%-8s %-6d %-2d %-20s %1s> %-20s %s\n",
+	       ts, e->pid, e->af == AF_INET ? 4 : 6, local,
+	       e->type == RETRANSMIT ? "R" : "L", remote, state);
+}
+
+/* perf buffer callback: report samples dropped on @cpu. */
+static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
+{
+	warn("Lost %llu events on CPU #%d!\n", lost_cnt, cpu);
+}
+
+/* Open the perf buffer on @perf_map_fd and print events until told to exit. */
+static void print_events(int perf_map_fd)
+{
+	struct perf_buffer *pb;
+	int err;
+
+	pb = perf_buffer__new(perf_map_fd, 128,
+			      handle_event, handle_lost_events, NULL, NULL);
+	if (!pb) {
+		warn("failed to open perf buffer: %d\n", -errno);
+		return;
+	}
+
+	print_events_header();
+	while (!exiting) {
+		err = perf_buffer__poll(pb, 100);
+		/* -EINTR is not treated as an error; the loop re-checks exiting */
+		if (err < 0 && err != -EINTR) {
+			warn("error polling perf buffer: %s\n", strerror(-err));
+			break;
+		}
+	}
+
+	perf_buffer__free(pb);
+}
+
+/*
+ * Parse options, choose the tracepoint or kprobe flavour of the retransmit
+ * program, load and attach the BPF object, then count flows or stream events
+ * until SIGINT/SIGTERM.  Every failure after open goes through cleanup.
+ */
+int main(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+		.args_doc = NULL,
+	};
+
+	struct tcpretrans_bpf *obj;
+	int err, tpmissing;
+
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	libbpf_set_print(libbpf_print_fn);
+
+	obj = tcpretrans_bpf__open();
+	if (!obj) {
+		warn("failed to open BPF object\n");
+		return 1;
+	}
+
+	/*
+	 * bpf will load non-existent trace points but fail at the attach stage, so
+	 * check to ensure our tp exists before we load it.
+	 */
+	tpmissing = access(tppath, F_OK);
+
+	if (tpmissing || env.kprobe) {
+		if (!env.kprobe)
+			warn("tcp_retransmit_skb tracepoint not found, falling back to kprobe\n");
+		err = bpf_program__set_autoload(obj->progs.tp_tcp_retransmit_skb, false);
+		if (err) {
+			warn("Unable to set autoload for tp_tcp_retransmit_skb\n");
+			goto cleanup;
+		}
+	} else {
+		err = bpf_program__set_autoload(obj->progs.tcp_retransmit_skb, false);
+		if (err) {
+			warn("Unable to set autoload for tcp_retransmit_skb\n");
+			goto cleanup;
+		}
+	}
+
+	if (!env.lossprobe) {
+		err = bpf_program__set_autoload(obj->progs.tcp_send_loss_probe, false);
+		if (err) {
+			warn("Unable to set autoload for tcp_send_loss_probe\n");
+			goto cleanup;
+		}
+	}
+
+	if (env.count)
+		obj->rodata->do_count = true;
+
+	err = tcpretrans_bpf__load(obj);
+	if (err) {
+		warn("failed to load BPF object: %d\n", err);
+		goto cleanup;
+	}
+
+	err = tcpretrans_bpf__attach(obj);
+	if (err) {
+		warn("failed to attach BPF programs: %s\n", strerror(-err));
+		goto cleanup;
+	}
+
+	if (signal(SIGINT, sig_handler) == SIG_ERR || signal(SIGTERM, sig_handler) == SIG_ERR) {
+		warn("can't set signal handler: %s\n", strerror(errno));
+		err = 1;
+		goto cleanup;
+	}
+	printf("Tracing retransmits ... Hit Ctrl-C to end\n");
+	if (env.count) {
+		print_count(bpf_map__fd(obj->maps.ipv4_count),
+			    bpf_map__fd(obj->maps.ipv6_count));
+	} else {
+		print_events(bpf_map__fd(obj->maps.events));
+	}
+
+cleanup:
+	tcpretrans_bpf__destroy(obj);
+
+	return err != 0;
+}
diff --git a/libbpf-tools/tcpretrans.h b/libbpf-tools/tcpretrans.h
new file mode 100644
index 000000000000..e003e626a17a
--- /dev/null
+++ b/libbpf-tools/tcpretrans.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2020 Anton Protopopov
+ * Copyright (c) 2021 Red Hat, Inc.
+ */
+#ifndef __TCPRETRANS_H
+#define __TCPRETRANS_H
+
+#define MAX_ENTRIES 8192	/* max_entries of each flow-count map and size of the user-space dump arrays */
+
+#define RETRANSMIT 1	/* event.type for a retransmit; printed as "R" */
+#define TLP 2		/* event.type passed by the tcp_send_loss_probe kprobe; printed as "L" */
+
+struct event {		/* record passed from the BPF program to user space */
+	int type;	/* RETRANSMIT or TLP */
+	int state;	/* skc_state read from the socket; index into TCPSTATE[] */
+	__u8 saddr[16];	/* source address bytes; user space uses only the first 4 for AF_INET */
+	__u8 daddr[16];	/* destination address bytes, same convention as saddr */
+	__u32 af; // AF_INET or AF_INET6
+	__u32 pid;	/* upper 32 bits of bpf_get_current_pid_tgid() */
+	__u16 dport;	/* skc_dport as read from the socket; user space applies ntohs() */
+	__u16 sport;	/* skc_num; printed without byte swapping */
+};
+
+struct ipv4_flow_key {	/* key of the ipv4_count map */
+	__u32 saddr;
+	__u32 daddr;
+	__u16 dport;	/* user space applies ntohs() before printing */
+	__u16 sport;
+};
+
+struct ipv6_flow_key {	/* key of the ipv6_count map */
+	__u8 saddr[16];
+	__u8 daddr[16];
+	__u16 dport;	/* user space applies ntohs() before printing */
+	__u16 sport;
+};
+
+#endif /* __TCPRETRANS_H */
\ No newline at end of file