diff --git a/docs/graph.svg b/docs/graph.svg index dd059f6f..8e1152b1 100644 --- a/docs/graph.svg +++ b/docs/graph.svg @@ -4,850 +4,880 @@ - - + + grout - + control_input - -control_input + +control_input loopback_input - -loopback_input + +loopback_input control_input->loopback_input - - + + arp_output_request - -arp_output_request + +arp_output_request control_input->arp_output_request - - + + icmp_local_send - -icmp_local_send + +icmp_local_send control_input->icmp_local_send - - + + icmp6_local_send - -icmp6_local_send + +icmp6_local_send control_input->icmp6_local_send - - + + - + -ndp_ns_output - -ndp_ns_output +ndp_na_output + +ndp_na_output - + -control_input->ndp_ns_output - - +control_input->ndp_na_output + + - + -ip6_output - -ip6_output +ndp_ns_output + +ndp_ns_output - + -control_input->ip6_output - - +control_input->ndp_ns_output + + - + -ndp_na_output - -ndp_na_output +ip6_output + +ip6_output - + -control_input->ndp_na_output - - +control_input->ip6_output + + ip_output - -ip_output + +ip_output control_input->ip_output - - + + arp_output_reply - -arp_output_reply + +arp_output_reply control_input->arp_output_reply - - + + eth_input - -eth_input + +eth_input loopback_input->eth_input - - + + eth_output - -eth_output + +eth_output arp_output_request->eth_output - - + + icmp_output - -icmp_output + +icmp_output icmp_local_send->icmp_output - - + + - + icmp6_output - -icmp6_output + +icmp6_output - + icmp6_local_send->icmp6_output - - + + + + + +ndp_na_output->icmp6_output + + - + ndp_ns_output->icmp6_output - - + + - + ip6_output->eth_output - - + + loop_xvrf - -loop_xvrf + +loop_xvrf - + ip6_output->loop_xvrf - - + + - + sr6_output - -sr6_output + +sr6_output - + ip6_output->sr6_output - - + + - + ip6_hold - -ip6_hold + +ip6_hold - + ip6_output->ip6_hold - - + + - + ip6_error_dest_unreach - -ip6_error_dest_unreach + +ip6_error_dest_unreach - + ip6_output->ip6_error_dest_unreach - - - - - -ndp_na_output->icmp6_output - - + + - + ip_output->eth_output 
- - + + - + ip_output->loop_xvrf - - + + - + +ip_fragment + +ip_fragment + + + +ip_output->ip_fragment + + + + + ip_hold - -ip_hold + +ip_hold - + ip_output->ip_hold - - + + - + ip_error_dest_unreach - -ip_error_dest_unreach + +ip_error_dest_unreach - + ip_output->ip_error_dest_unreach - - + + + + + +ip_error_frag_needed + +ip_error_frag_needed + + + +ip_output->ip_error_frag_needed + + - + ipip_output - -ipip_output + +ipip_output - + ip_output->ipip_output - - + + - + ip_output->sr6_output - - + + arp_output_reply->eth_output - - + + control_output - -control_output + +control_output arp_input - -arp_input + +arp_input eth_input->arp_input - - + + ip_input - -ip_input + +ip_input eth_input->ip_input - - + + ip6_input - -ip6_input + +ip6_input eth_input->ip6_input - - + + arp_input_request - -arp_input_request + +arp_input_request arp_input->arp_input_request - - + + arp_input_reply - -arp_input_reply + +arp_input_reply arp_input->arp_input_reply - - + + - + ip_input->ip_output - - + + ip_forward - -ip_forward + +ip_forward - + ip_input->ip_forward - - + + - + dnat44_dynamic - -dnat44_dynamic + +dnat44_dynamic - + ip_input->dnat44_dynamic - - + + - + ip_input_local - -ip_input_local + +ip_input_local - + ip_input->ip_input_local - - + + - + ip_input->ip_error_dest_unreach - - + + - + dnat44_static - -dnat44_static + +dnat44_static - + ip_input->dnat44_static - - + + - + ip6_input->ip6_output - - + + - + ip6_forward - -ip6_forward + +ip6_forward - + ip6_input->ip6_forward - - + + - + ip6_input_local - -ip6_input_local + +ip6_input_local - + ip6_input->ip6_input_local - - + + - + ip6_input->ip6_error_dest_unreach - - + + - + sr6_local - -sr6_local + +sr6_local - + ip6_input->sr6_local - - + + port_output - -port_output + +port_output eth_output->port_output - - + + port_tx - -port_tx + +port_tx port_output->port_tx - - + + l1_xconnect - -l1_xconnect + +l1_xconnect l1_xconnect->port_output - - + + loopback_output - -loopback_output + +loopback_output 
loopback_output->control_output - - + + loop_xvrf->ip_input - - + + loop_xvrf->ip6_input - - + + port_rx - -port_rx + +port_rx port_rx->eth_input - - + + port_rx->l1_xconnect - - + + arp_input_request->control_output - - + + arp_input_reply->control_output - - + + icmp_input - -icmp_input + +icmp_input icmp_input->control_output - - + + icmp_input->icmp_output - - + + icmp_output->ip_output - - + + ip_forward->ip_output - - + + ip_error_ttl_exceeded - -ip_error_ttl_exceeded + +ip_error_ttl_exceeded ip_forward->ip_error_ttl_exceeded - - + + - + +ip_fragment->ip_output + + + + + ip_hold->control_output - - + + - + dnat44_dynamic->ip_forward - - + + - + dnat44_dynamic->ip_input_local - - + + - + dnat44_dynamic->ip_error_dest_unreach - - + + - + ip_input_local->icmp_input - - + + - + ipip_input - -ipip_input + +ipip_input - + ip_input_local->ipip_input - - + + - + l4_input_local - -l4_input_local + +l4_input_local - + ip_input_local->l4_input_local - - + + - + dnat44_static->ip_forward - - + + - + dnat44_static->ip_input_local - - + + - + dnat44_static->ip_error_dest_unreach - - + + - + ipip_input->ip_input - - + + - + l4_loopback_output - -l4_loopback_output + +l4_loopback_output - + l4_input_local->l4_loopback_output - - + + - + ipip_output->ip_output - - + + - + sr6_output->ip6_output - - + + - + icmp6_input - -icmp6_input + +icmp6_input - + icmp6_input->control_output - - + + - + icmp6_input->icmp6_output - - + + - + ndp_ns_input - -ndp_ns_input + +ndp_ns_input - + icmp6_input->ndp_ns_input - - + + - + ndp_na_input - -ndp_na_input + +ndp_na_input - + icmp6_input->ndp_na_input - - + + - + ndp_rs_input - -ndp_rs_input + +ndp_rs_input - + icmp6_input->ndp_rs_input - - + + - + icmp6_output->ip6_output - - + + - + ndp_ns_input->control_output - - + + - + ndp_na_input->control_output - - + + - + ndp_rs_input->control_output - - + + - + ip6_forward->ip6_output - - + + - + ip6_error_ttl_exceeded - -ip6_error_ttl_exceeded + +ip6_error_ttl_exceeded - + 
ip6_forward->ip6_error_ttl_exceeded - - + + - + ip6_hold->control_output - - + + - + ip6_input_local->l4_input_local - - + + - + ip6_input_local->icmp6_input - - + + - + sr6_local->ip_input - - + + - + sr6_local->ip6_input - - + + - + sr6_local->ip6_input_local - - + + - + sr6_local->ip6_error_dest_unreach - - + + - + l4_loopback_output->loopback_output - - + + diff --git a/modules/ip/datapath/ip_error.c b/modules/ip/datapath/ip_error.c index 4057daef..fd9c79f1 100644 --- a/modules/ip/datapath/ip_error.c +++ b/modules/ip/datapath/ip_error.c @@ -110,6 +110,13 @@ static int no_route_init(const struct rte_graph *, struct rte_node *node) { return 0; } +static int frag_needed_init(const struct rte_graph *, struct rte_node *node) { /* init callback of the ip_error_frag_needed node: stores the ICMP type/code in the node's ip_error_ctx */ + struct ip_error_ctx *ctx = ip_error_ctx(node); + ctx->icmp_type = RTE_ICMP_TYPE_DEST_UNREACHABLE; + ctx->icmp_code = RTE_ICMP_CODE_UNREACH_FRAG; /* destination unreachable / fragmentation needed and DF set (ICMP type 3, code 4) */ + return 0; +} + static struct rte_node_register ip_forward_ttl_exceeded_node = { .name = "ip_error_ttl_exceeded", .process = ip_error_process, @@ -134,6 +141,18 @@ static struct rte_node_register no_route_node = { .init = no_route_init, }; +static struct rte_node_register frag_needed_node = { + .name = "ip_error_frag_needed", + .process = ip_error_process, + .nb_edges = EDGE_COUNT, + .next_nodes = { + [ICMP_OUTPUT] = "icmp_output", + [NO_HEADROOM] = "error_no_headroom", + [NO_IP] = "error_no_local_ip", + }, + .init = frag_needed_init, +}; + static struct gr_node_info info_ttl_exceeded = { .node = &ip_forward_ttl_exceeded_node, }; @@ -142,7 +161,12 @@ static struct gr_node_info info_no_route = { .node = &no_route_node, }; +static struct gr_node_info info_frag_needed = { + .node = &frag_needed_node, +}; + GR_NODE_REGISTER(info_ttl_exceeded); GR_NODE_REGISTER(info_no_route); +GR_NODE_REGISTER(info_frag_needed); GR_DROP_REGISTER(error_no_local_ip); diff --git a/modules/ip/datapath/ip_fragment.c b/modules/ip/datapath/ip_fragment.c new file mode 100644 index 00000000..8e5ee1be --- /dev/null +++ 
b/modules/ip/datapath/ip_fragment.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2025 Anthony Harivel + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +struct ip_fragment_trace_data { + uint16_t packet_id; + uint16_t frag_num; + uint16_t offset; + uint8_t more_frags; +}; + +enum { + IP_OUTPUT = 0, + NO_MBUF, + ALREADY_FRAGMENTED, + ERROR, + EDGE_COUNT, +}; + +static uint16_t +ip_fragment_process(struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs) { + struct rte_mbuf *mbuf, *frag_mbuf; + struct rte_ipv4_hdr *ip, *frag_ip; + uint16_t frag_size, frag_data_len; + uint16_t data_len, offset; + const struct iface *iface; + uint16_t num_frags, i; + uint16_t ip_hdr_len; + uint16_t sent = 0; + rte_edge_t edge; + void *payload; + + for (uint16_t j = 0; j < nb_objs; j++) { + mbuf = objs[j]; + ip = rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *); + + // Packets that are already fragments (MF set or non-zero offset) are not re-fragmented by this node + if (ip->fragment_offset + & RTE_BE16(RTE_IPV4_HDR_MF_FLAG | RTE_IPV4_HDR_OFFSET_MASK)) { + // Send them to the ALREADY_FRAGMENTED edge (ip_fragment_already_fragmented drop node) + edge = ALREADY_FRAGMENTED; + goto drop; + } + + iface = mbuf_data(mbuf)->iface; + assert(iface != NULL); + + ip_hdr_len = rte_ipv4_hdr_len(ip); + data_len = rte_be_to_cpu_16(ip->total_length) - ip_hdr_len; + + // Fragment payload size: what the egress MTU leaves after the IP header, rounded down to a multiple of 8 (must be >= 8) + uint16_t max_payload = (uint16_t)(iface->mtu - ip_hdr_len); + frag_size = RTE_ALIGN_FLOOR(max_payload, 8); + if (unlikely(frag_size < 8)) { + edge = ERROR; + goto drop; + } + + num_frags = (data_len + frag_size - 1) / frag_size; + assert(num_frags > 1); + + // The original mbuf becomes the first fragment: offset 0, MF set, header checksum recomputed + ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_size); + ip->fragment_offset = RTE_BE16(RTE_IPV4_HDR_MF_FLAG); + ip->hdr_checksum = 0; + ip->hdr_checksum = rte_ipv4_cksum(ip); + + if 
(gr_mbuf_is_traced(mbuf)) { + struct ip_fragment_trace_data *t; + t = gr_mbuf_trace_add(mbuf, node, sizeof(*t)); + t->packet_id = rte_be_to_cpu_16(ip->packet_id); + t->frag_num = 0; + t->offset = 0; + t->more_frags = 1; + } + + // Enqueue the first fragment now; its buffer is trimmed only after the loop, which still copies payload out of it + rte_node_enqueue_x1(graph, node, IP_OUTPUT, mbuf); + sent++; + + // Create and enqueue remaining fragments + for (i = 1; i < num_frags; i++) { + // New mbuf holding a copy of the IPv4 header; length, offset/flags and checksum are rewritten below + frag_mbuf = rte_pktmbuf_copy(mbuf, mbuf->pool, 0, ip_hdr_len); + if (unlikely(frag_mbuf == NULL)) { + break; // NOTE(review): the remaining fragments are skipped silently; the NO_MBUF edge is never selected + } + + frag_ip = rte_pktmbuf_mtod(frag_mbuf, struct rte_ipv4_hdr *); + offset = i * frag_size; + frag_data_len = RTE_MIN(frag_size, data_len - offset); + + payload = rte_pktmbuf_append(frag_mbuf, frag_data_len); + if (unlikely(payload == NULL)) { + rte_pktmbuf_free(frag_mbuf); + break; + } + + memcpy(payload, + rte_pktmbuf_mtod_offset(mbuf, const void *, ip_hdr_len + offset), + frag_data_len); + + frag_ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_data_len); + frag_ip->fragment_offset = rte_cpu_to_be_16( + (offset / 8) | ((i < num_frags - 1) ? RTE_IPV4_HDR_MF_FLAG : 0) + ); + frag_ip->hdr_checksum = 0; + frag_ip->hdr_checksum = rte_ipv4_cksum(frag_ip); + + *ip_output_mbuf_data(frag_mbuf) = *ip_output_mbuf_data(mbuf); + frag_mbuf->packet_type = mbuf->packet_type; + if (gr_mbuf_is_traced(mbuf)) { + struct ip_fragment_trace_data *t; + t = gr_mbuf_trace_add(frag_mbuf, node, sizeof(*t)); + t->packet_id = rte_be_to_cpu_16(frag_ip->packet_id); + t->frag_num = i; + t->offset = offset; + t->more_frags = (i < num_frags - 1) ? 
1 : 0; + } + + rte_node_enqueue_x1(graph, node, IP_OUTPUT, frag_mbuf); + sent++; + } + + // Remove the trailing data_len - frag_size payload bytes (copied into the other fragments) from the original mbuf + rte_pktmbuf_trim(mbuf, data_len - frag_size); + + continue; + +drop: + rte_node_enqueue_x1(graph, node, edge, mbuf); + sent++; + } + + return sent; // counts every enqueued mbuf (fragments and drops), so it may exceed nb_objs +} + +static int ip_fragment_trace_format(char *buf, size_t len, const void *data, size_t /*data_len*/) { + const struct ip_fragment_trace_data *t = data; + return snprintf( + buf, + len, + "id=%u frag=%u offset=%u%s", + t->packet_id, + t->frag_num, + t->offset, + t->more_frags ? " MF" : "" + ); +} + +static struct rte_node_register fragment_node = { + .name = "ip_fragment", + .process = ip_fragment_process, + .nb_edges = EDGE_COUNT, + .next_nodes = { + [IP_OUTPUT] = "ip_output", + [NO_MBUF] = "error_no_headroom", + [ALREADY_FRAGMENTED] = "ip_fragment_already_fragmented", + [ERROR] = "ip_fragment_error" + }, +}; + +static struct gr_node_info info = { + .node = &fragment_node, + .trace_format = ip_fragment_trace_format, +}; + +GR_NODE_REGISTER(info); + +GR_DROP_REGISTER(ip_fragment_error); +GR_DROP_REGISTER(ip_fragment_already_fragmented); diff --git a/modules/ip/datapath/ip_output.c b/modules/ip/datapath/ip_output.c index 5f2eb316..36e06f50 100644 --- a/modules/ip/datapath/ip_output.c +++ b/modules/ip/datapath/ip_output.c @@ -26,7 +26,8 @@ enum { HOLD, NO_ROUTE, ERROR, - TOO_BIG, + FRAGMENT, + FRAG_NEEDED, DROP, EDGE_COUNT, }; @@ -88,15 +89,20 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u goto next; } + mbuf_data(mbuf)->iface = iface; + if (rte_pktmbuf_pkt_len(mbuf) > iface->mtu) { - edge = TOO_BIG; + if (ip->fragment_offset & rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG)) { + edge = FRAG_NEEDED; + } else { + edge = FRAGMENT; + } goto next; } // Determine what is the next node based on the output interface type // By default, it will be eth_output unless another output node was registered. 
edge = iface_type_edges[iface->type]; - mbuf_data(mbuf)->iface = iface; switch (snat44_process(iface, mbuf)) { case NAT_VERDICT_CONTINUE: @@ -151,7 +157,8 @@ static struct rte_node_register output_node = { [HOLD] = "ip_hold", [NO_ROUTE] = "ip_error_dest_unreach", [ERROR] = "ip_output_error", - [TOO_BIG] = "ip_output_too_big", + [FRAGMENT] = "ip_fragment", + [FRAG_NEEDED] = "ip_error_frag_needed", [DROP] = "ip_output_drop", }, }; @@ -164,5 +171,4 @@ static struct gr_node_info info = { GR_NODE_REGISTER(info); GR_DROP_REGISTER(ip_output_error); -GR_DROP_REGISTER(ip_output_too_big); GR_DROP_REGISTER(ip_output_drop); diff --git a/modules/ip/datapath/meson.build b/modules/ip/datapath/meson.build index 1df71634..cade60ad 100644 --- a/modules/ip/datapath/meson.build +++ b/modules/ip/datapath/meson.build @@ -13,6 +13,7 @@ src += files( 'icmp_output.c', 'ip_error.c', 'ip_forward.c', + 'ip_fragment.c', 'ip_hold.c', 'ip_input.c', 'ip_local.c', diff --git a/smoke/ip_fragment_test.sh b/smoke/ip_fragment_test.sh new file mode 100755 index 00000000..827092b8 --- /dev/null +++ b/smoke/ip_fragment_test.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2025 Anthony Harivel +# Test IPv4 fragmentation + +. 
$(dirname $0)/_init.sh + +p0=${run_id}0 +p1=${run_id}1 + +grcli interface add port $p0 devargs net_tap0,iface=$p0 mac f0:0d:ac:dc:00:00 +# Set smaller MTU on p1 (egress) to force fragmentation +grcli interface add port $p1 devargs net_tap1,iface=$p1 mac f0:0d:ac:dc:00:01 mtu 1280 +grcli address add 172.16.0.1/24 iface $p0 +grcli address add 172.16.1.1/24 iface $p1 + +for n in 0 1; do + p=$run_id$n + netns_add $p + ip link set $p mtu 1500 + ip link set $p netns $p + ip -n $p link set $p address ba:d0:ca:ca:00:0$n + ip -n $p link set $p up + ip -n $p link set lo up + ip -n $p addr add 172.16.$n.2/24 dev $p + ip -n $p route add default via 172.16.$n.1 + # Clear PMTU cache to ensure kernel uses interface MTU + ip -n $p route flush cache +done + +# Test 1: Ping with default packet size (should work without fragmentation) +ip netns exec $p0 ping -i0.01 -c3 -n 172.16.1.2 + +# Test 2: Large packet with DF flag set (should get ICMP fragmentation needed error) +# Send an ICMP echo with a 1260-byte payload and DF=1 (Don't Fragment) +# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes +# Fits in p0 MTU (1500) but exceeds p1 MTU (1280) +# Expected: ICMP Type 3 Code 4 (Fragmentation Needed and DF Set); only the ping failure is asserted, the ICMP error itself is not inspected +ip netns exec $p0 ping -i0.01 -c3 -s 1260 -M do -n 172.16.1.2 && fail "ping with DF flag should have failed" + +# Test 3: Large packet without DF flag (should fragment and succeed) +# Send an ICMP echo with a 1260-byte payload and DF=0 (fragmentation allowed) +# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes +# Fits in p0 MTU (1500) but needs fragmentation for p1 MTU (1280) +# Expected: 2 fragments of 1276 + 32 bytes (20-byte header + 1256 and + 12 payload bytes; 1256 = 1260 rounded down to a multiple of 8) and ping succeeds +ip netns exec $p0 ip route flush cache # presumably discards the path MTU learned in test 2 so the sender does not fragment locally - TODO confirm +ip netns exec $p0 ping -i0.01 -c3 -s 1260 -M dont -n 172.16.1.2