From ae3bb2a41140e1c7435850bc516ddccd2c9eb0a8 Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Wed, 15 Oct 2025 15:16:12 +0200 Subject: [PATCH 1/3] ip: send ICMP error when DF flag is set and packet is too big When a packet exceeds the output interface MTU and has the DF (Don't Fragment) flag set, the router must drop the packet and send an ICMP "Destination Unreachable" error (Type 3, Code 4: Fragmentation Needed and DF Set) back to the source. Previously, grout would silently drop packets that exceeded the MTU regardless of the DF flag setting. This prevented proper Path MTU Discovery (RFC 1191) from functioning. Add a new error node ip_error_frag_needed that generates ICMP Type 3 Code 4 errors using the existing ip_error_process() infrastructure. Signed-off-by: Anthony Harivel Reviewed-by: Christophe Fontaine --- docs/graph.svg | 700 ++++++++++++++++---------------- modules/ip/datapath/ip_error.c | 24 ++ modules/ip/datapath/ip_output.c | 8 +- 3 files changed, 387 insertions(+), 345 deletions(-) diff --git a/docs/graph.svg b/docs/graph.svg index dd059f6f..ef26572e 100644 --- a/docs/graph.svg +++ b/docs/graph.svg @@ -1,853 +1,865 @@ - - - + + grout - + control_input - -control_input + +control_input loopback_input - -loopback_input + +loopback_input control_input->loopback_input - - + + arp_output_request - -arp_output_request + +arp_output_request control_input->arp_output_request - - + + icmp_local_send - -icmp_local_send + +icmp_local_send control_input->icmp_local_send - - + + icmp6_local_send - -icmp6_local_send + +icmp6_local_send control_input->icmp6_local_send - - + + ndp_ns_output - -ndp_ns_output + +ndp_ns_output control_input->ndp_ns_output - - + + ip6_output - -ip6_output + +ip6_output control_input->ip6_output - - + + ndp_na_output - -ndp_na_output + +ndp_na_output control_input->ndp_na_output - - + + ip_output - -ip_output + +ip_output control_input->ip_output - - + + arp_output_reply - -arp_output_reply + +arp_output_reply 
control_input->arp_output_reply - - + + eth_input - -eth_input + +eth_input loopback_input->eth_input - - + + eth_output - -eth_output + +eth_output arp_output_request->eth_output - - + + icmp_output - -icmp_output + +icmp_output icmp_local_send->icmp_output - - + + - + icmp6_output - -icmp6_output + +icmp6_output - + icmp6_local_send->icmp6_output - - + + - + ndp_ns_output->icmp6_output - - + + - + ip6_output->eth_output - - + + loop_xvrf - -loop_xvrf + +loop_xvrf - + ip6_output->loop_xvrf - - + + - + sr6_output - -sr6_output + +sr6_output - + ip6_output->sr6_output - - + + - + ip6_hold - -ip6_hold + +ip6_hold - + ip6_output->ip6_hold - - + + - + ip6_error_dest_unreach - -ip6_error_dest_unreach + +ip6_error_dest_unreach - + ip6_output->ip6_error_dest_unreach - - + + - + ndp_na_output->icmp6_output - - + + ip_output->eth_output - - + + - + ip_output->loop_xvrf - - + + ip_hold - -ip_hold + +ip_hold ip_output->ip_hold - - + + ip_error_dest_unreach - -ip_error_dest_unreach + +ip_error_dest_unreach ip_output->ip_error_dest_unreach - - + + - + +ip_error_frag_needed + +ip_error_frag_needed + + + +ip_output->ip_error_frag_needed + + + + + ipip_output - -ipip_output + +ipip_output - + ip_output->ipip_output - - + + - + ip_output->sr6_output - - + + arp_output_reply->eth_output - - + + control_output - -control_output + +control_output arp_input - -arp_input + +arp_input eth_input->arp_input - - + + ip_input - -ip_input + +ip_input eth_input->ip_input - - + + ip6_input - -ip6_input + +ip6_input eth_input->ip6_input - - + + arp_input_request - -arp_input_request + +arp_input_request arp_input->arp_input_request - - + + arp_input_reply - -arp_input_reply + +arp_input_reply arp_input->arp_input_reply - - + + ip_input->ip_output - - + + ip_forward - -ip_forward + +ip_forward ip_input->ip_forward - - + + dnat44_dynamic - -dnat44_dynamic + +dnat44_dynamic ip_input->dnat44_dynamic - - + + ip_input_local - -ip_input_local + +ip_input_local ip_input->ip_input_local - - + + 
ip_input->ip_error_dest_unreach - - + + dnat44_static - -dnat44_static + +dnat44_static ip_input->dnat44_static - - + + - + ip6_input->ip6_output - - + + - + ip6_forward - -ip6_forward + +ip6_forward - + ip6_input->ip6_forward - - + + - + ip6_input_local - -ip6_input_local + +ip6_input_local - + ip6_input->ip6_input_local - - + + - + ip6_input->ip6_error_dest_unreach - - + + - + sr6_local - -sr6_local + +sr6_local - + ip6_input->sr6_local - - + + port_output - -port_output + +port_output eth_output->port_output - - + + port_tx - -port_tx + +port_tx port_output->port_tx - - + + l1_xconnect - -l1_xconnect + +l1_xconnect l1_xconnect->port_output - - + + loopback_output - -loopback_output + +loopback_output loopback_output->control_output - - + + loop_xvrf->ip_input - - + + loop_xvrf->ip6_input - - + + port_rx - -port_rx + +port_rx port_rx->eth_input - - + + port_rx->l1_xconnect - - + + arp_input_request->control_output - - + + arp_input_reply->control_output - - + + icmp_input - -icmp_input + +icmp_input icmp_input->control_output - - + + icmp_input->icmp_output - - + + icmp_output->ip_output - - + + ip_forward->ip_output - - + + ip_error_ttl_exceeded - -ip_error_ttl_exceeded + +ip_error_ttl_exceeded ip_forward->ip_error_ttl_exceeded - - + + ip_hold->control_output - - + + - + dnat44_dynamic->ip_forward - - + + - + dnat44_dynamic->ip_input_local - - + + - + dnat44_dynamic->ip_error_dest_unreach - - + + ip_input_local->icmp_input - - + + ipip_input - -ipip_input + +ipip_input ip_input_local->ipip_input - - + + l4_input_local - -l4_input_local + +l4_input_local ip_input_local->l4_input_local - - + + - + dnat44_static->ip_forward - - + + - + dnat44_static->ip_input_local - - + + - + dnat44_static->ip_error_dest_unreach - - + + - + ipip_input->ip_input - - + + - + l4_loopback_output - -l4_loopback_output + +l4_loopback_output - + l4_input_local->l4_loopback_output - - + + - + ipip_output->ip_output - - + + - + sr6_output->ip6_output - - + + - + icmp6_input - -icmp6_input 
+ +icmp6_input - + icmp6_input->control_output - - + + - + icmp6_input->icmp6_output - - + + - + ndp_ns_input - -ndp_ns_input + +ndp_ns_input - + icmp6_input->ndp_ns_input - - + + - + ndp_na_input - -ndp_na_input + +ndp_na_input - + icmp6_input->ndp_na_input - - + + - + ndp_rs_input - -ndp_rs_input + +ndp_rs_input - + icmp6_input->ndp_rs_input - - + + - + icmp6_output->ip6_output - - + + - + ndp_ns_input->control_output - - + + - + ndp_na_input->control_output - - + + - + ndp_rs_input->control_output - - + + - + ip6_forward->ip6_output - - + + - + ip6_error_ttl_exceeded - -ip6_error_ttl_exceeded + +ip6_error_ttl_exceeded - + ip6_forward->ip6_error_ttl_exceeded - - + + - + ip6_hold->control_output - - + + - + ip6_input_local->l4_input_local - - + + - + ip6_input_local->icmp6_input - - + + - + sr6_local->ip_input - - + + - + sr6_local->ip6_input - - + + - + sr6_local->ip6_input_local - - + + - + sr6_local->ip6_error_dest_unreach - - + + - + l4_loopback_output->loopback_output - - + + diff --git a/modules/ip/datapath/ip_error.c b/modules/ip/datapath/ip_error.c index 4057daef..fd9c79f1 100644 --- a/modules/ip/datapath/ip_error.c +++ b/modules/ip/datapath/ip_error.c @@ -110,6 +110,13 @@ static int no_route_init(const struct rte_graph *, struct rte_node *node) { return 0; } +static int frag_needed_init(const struct rte_graph *, struct rte_node *node) { + struct ip_error_ctx *ctx = ip_error_ctx(node); + ctx->icmp_type = RTE_ICMP_TYPE_DEST_UNREACHABLE; + ctx->icmp_code = RTE_ICMP_CODE_UNREACH_FRAG; + return 0; +} + static struct rte_node_register ip_forward_ttl_exceeded_node = { .name = "ip_error_ttl_exceeded", .process = ip_error_process, @@ -134,6 +141,18 @@ static struct rte_node_register no_route_node = { .init = no_route_init, }; +static struct rte_node_register frag_needed_node = { + .name = "ip_error_frag_needed", + .process = ip_error_process, + .nb_edges = EDGE_COUNT, + .next_nodes = { + [ICMP_OUTPUT] = "icmp_output", + [NO_HEADROOM] = "error_no_headroom", + 
[NO_IP] = "error_no_local_ip", + }, + .init = frag_needed_init, +}; + static struct gr_node_info info_ttl_exceeded = { .node = &ip_forward_ttl_exceeded_node, }; @@ -142,7 +161,12 @@ static struct gr_node_info info_no_route = { .node = &no_route_node, }; +static struct gr_node_info info_frag_needed = { + .node = &frag_needed_node, +}; + GR_NODE_REGISTER(info_ttl_exceeded); GR_NODE_REGISTER(info_no_route); +GR_NODE_REGISTER(info_frag_needed); GR_DROP_REGISTER(error_no_local_ip); diff --git a/modules/ip/datapath/ip_output.c b/modules/ip/datapath/ip_output.c index 5f2eb316..8a5d21db 100644 --- a/modules/ip/datapath/ip_output.c +++ b/modules/ip/datapath/ip_output.c @@ -27,6 +27,7 @@ enum { NO_ROUTE, ERROR, TOO_BIG, + FRAG_NEEDED, DROP, EDGE_COUNT, }; @@ -89,7 +90,11 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u } if (rte_pktmbuf_pkt_len(mbuf) > iface->mtu) { - edge = TOO_BIG; + if (ip->fragment_offset & rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG)) { + edge = FRAG_NEEDED; + } else { + edge = TOO_BIG; + } goto next; } @@ -152,6 +157,7 @@ static struct rte_node_register output_node = { [NO_ROUTE] = "ip_error_dest_unreach", [ERROR] = "ip_output_error", [TOO_BIG] = "ip_output_too_big", + [FRAG_NEEDED] = "ip_error_frag_needed", [DROP] = "ip_output_drop", }, }; From 61168c16a6f438d8541f818108ea6d06dc7e3e06 Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Mon, 20 Oct 2025 11:23:58 +0200 Subject: [PATCH 2/3] ip: implement output packet fragmentation Add an ip_fragment node that fragments IPv4 packets exceeding the outgoing interface MTU. The node is invoked from ip_output when a packet is too large and does not have the DF (Don't Fragment) flag set. Fragmentation follows RFC 791 requirements: each fragment payload size is rounded down to a multiple of 8 bytes, fragment offset field is expressed in 8-byte units, and the MF (More Fragments) flag is set on all fragments except the last one. 
All fragments share the same packet ID from the original packet. The ip_output node now stores the output interface in mbuf metadata before the size check, ensuring ip_fragment can access the MTU for proper fragment sizing. The previous ip_output_too_big drop node is removed as oversized packets are now handled by ip_fragment. Fragments inherit trace state from the original packet to maintain debugging capability across fragmentation boundaries. Closes: https://github.com/DPDK/grout/issues/336 Signed-off-by: Anthony Harivel Reviewed-by: Christophe Fontaine --- docs/graph.svg | 788 +++++++++++++++--------------- modules/ip/datapath/ip_fragment.c | 186 +++++++ modules/ip/datapath/ip_output.c | 10 +- modules/ip/datapath/meson.build | 1 + 4 files changed, 595 insertions(+), 390 deletions(-) create mode 100644 modules/ip/datapath/ip_fragment.c diff --git a/docs/graph.svg b/docs/graph.svg index ef26572e..8e1152b1 100644 --- a/docs/graph.svg +++ b/docs/graph.svg @@ -1,865 +1,883 @@ - - - + + grout - + control_input - -control_input + +control_input loopback_input - -loopback_input + +loopback_input control_input->loopback_input - - + + arp_output_request - -arp_output_request + +arp_output_request control_input->arp_output_request - - + + icmp_local_send - -icmp_local_send + +icmp_local_send control_input->icmp_local_send - - + + icmp6_local_send - -icmp6_local_send + +icmp6_local_send control_input->icmp6_local_send - - + + - + -ndp_ns_output - -ndp_ns_output +ndp_na_output + +ndp_na_output - + -control_input->ndp_ns_output - - +control_input->ndp_na_output + + - + -ip6_output - -ip6_output +ndp_ns_output + +ndp_ns_output - + -control_input->ip6_output - - +control_input->ndp_ns_output + + - + -ndp_na_output - -ndp_na_output +ip6_output + +ip6_output - + -control_input->ndp_na_output - - +control_input->ip6_output + + ip_output - -ip_output + +ip_output control_input->ip_output - - + + arp_output_reply - -arp_output_reply + +arp_output_reply 
control_input->arp_output_reply - - + + eth_input - -eth_input + +eth_input loopback_input->eth_input - - + + eth_output - -eth_output + +eth_output arp_output_request->eth_output - - + + icmp_output - -icmp_output + +icmp_output icmp_local_send->icmp_output - - + + - + icmp6_output - -icmp6_output + +icmp6_output - + icmp6_local_send->icmp6_output - - + + - + +ndp_na_output->icmp6_output + + + + + ndp_ns_output->icmp6_output - - + + - + ip6_output->eth_output - - + + loop_xvrf - -loop_xvrf + +loop_xvrf - + ip6_output->loop_xvrf - - + + - + sr6_output - -sr6_output + +sr6_output - + ip6_output->sr6_output - - + + - + ip6_hold - -ip6_hold + +ip6_hold - + ip6_output->ip6_hold - - + + - + ip6_error_dest_unreach - -ip6_error_dest_unreach + +ip6_error_dest_unreach - + ip6_output->ip6_error_dest_unreach - - - - - -ndp_na_output->icmp6_output - - + + - + ip_output->eth_output - - + + - + ip_output->loop_xvrf - - + + - + +ip_fragment + +ip_fragment + + + +ip_output->ip_fragment + + + + + ip_hold - -ip_hold + +ip_hold - + ip_output->ip_hold - - + + - + ip_error_dest_unreach - -ip_error_dest_unreach + +ip_error_dest_unreach - + ip_output->ip_error_dest_unreach - - + + - + ip_error_frag_needed - -ip_error_frag_needed + +ip_error_frag_needed - + ip_output->ip_error_frag_needed - - + + - + ipip_output - -ipip_output + +ipip_output - + ip_output->ipip_output - - + + - + ip_output->sr6_output - - + + arp_output_reply->eth_output - - + + control_output - -control_output + +control_output arp_input - -arp_input + +arp_input eth_input->arp_input - - + + ip_input - -ip_input + +ip_input eth_input->ip_input - - + + ip6_input - -ip6_input + +ip6_input eth_input->ip6_input - - + + arp_input_request - -arp_input_request + +arp_input_request arp_input->arp_input_request - - + + arp_input_reply - -arp_input_reply + +arp_input_reply arp_input->arp_input_reply - - + + - + ip_input->ip_output - - + + ip_forward - -ip_forward + +ip_forward - + ip_input->ip_forward - - + + - + dnat44_dynamic - 
-dnat44_dynamic + +dnat44_dynamic - + ip_input->dnat44_dynamic - - + + - + ip_input_local - -ip_input_local + +ip_input_local - + ip_input->ip_input_local - - + + - + ip_input->ip_error_dest_unreach - - + + - + dnat44_static - -dnat44_static + +dnat44_static - + ip_input->dnat44_static - - + + - + ip6_input->ip6_output - - + + - + ip6_forward - -ip6_forward + +ip6_forward - + ip6_input->ip6_forward - - + + - + ip6_input_local - -ip6_input_local + +ip6_input_local - + ip6_input->ip6_input_local - - + + - + ip6_input->ip6_error_dest_unreach - - + + - + sr6_local - -sr6_local + +sr6_local - + ip6_input->sr6_local - - + + port_output - -port_output + +port_output eth_output->port_output - - + + port_tx - -port_tx + +port_tx port_output->port_tx - - + + l1_xconnect - -l1_xconnect + +l1_xconnect l1_xconnect->port_output - - + + loopback_output - -loopback_output + +loopback_output loopback_output->control_output - - + + loop_xvrf->ip_input - - + + loop_xvrf->ip6_input - - + + port_rx - -port_rx + +port_rx port_rx->eth_input - - + + port_rx->l1_xconnect - - + + arp_input_request->control_output - - + + arp_input_reply->control_output - - + + icmp_input - -icmp_input + +icmp_input icmp_input->control_output - - + + icmp_input->icmp_output - - + + icmp_output->ip_output - - + + ip_forward->ip_output - - + + ip_error_ttl_exceeded - -ip_error_ttl_exceeded + +ip_error_ttl_exceeded ip_forward->ip_error_ttl_exceeded - - + + - + +ip_fragment->ip_output + + + + + ip_hold->control_output - - + + - + dnat44_dynamic->ip_forward - - + + - + dnat44_dynamic->ip_input_local - - + + - + dnat44_dynamic->ip_error_dest_unreach - - + + - + ip_input_local->icmp_input - - + + - + ipip_input - -ipip_input + +ipip_input - + ip_input_local->ipip_input - - + + - + l4_input_local - -l4_input_local + +l4_input_local - + ip_input_local->l4_input_local - - + + - + dnat44_static->ip_forward - - + + - + dnat44_static->ip_input_local - - + + - + dnat44_static->ip_error_dest_unreach - - + + - + 
ipip_input->ip_input - - + + - + l4_loopback_output - -l4_loopback_output + +l4_loopback_output - + l4_input_local->l4_loopback_output - - + + - + ipip_output->ip_output - - + + - + sr6_output->ip6_output - - + + - + icmp6_input - -icmp6_input + +icmp6_input - + icmp6_input->control_output - - + + - + icmp6_input->icmp6_output - - + + - + ndp_ns_input - -ndp_ns_input + +ndp_ns_input - + icmp6_input->ndp_ns_input - - + + - + ndp_na_input - -ndp_na_input + +ndp_na_input - + icmp6_input->ndp_na_input - - + + - + ndp_rs_input - -ndp_rs_input + +ndp_rs_input - + icmp6_input->ndp_rs_input - - + + - + icmp6_output->ip6_output - - + + - + ndp_ns_input->control_output - - + + - + ndp_na_input->control_output - - + + - + ndp_rs_input->control_output - - + + - + ip6_forward->ip6_output - - + + - + ip6_error_ttl_exceeded - -ip6_error_ttl_exceeded + +ip6_error_ttl_exceeded - + ip6_forward->ip6_error_ttl_exceeded - - + + - + ip6_hold->control_output - - + + - + ip6_input_local->l4_input_local - - + + - + ip6_input_local->icmp6_input - - + + - + sr6_local->ip_input - - + + - + sr6_local->ip6_input - - + + - + sr6_local->ip6_input_local - - + + - + sr6_local->ip6_error_dest_unreach - - + + - + l4_loopback_output->loopback_output - - + + diff --git a/modules/ip/datapath/ip_fragment.c b/modules/ip/datapath/ip_fragment.c new file mode 100644 index 00000000..8e5ee1be --- /dev/null +++ b/modules/ip/datapath/ip_fragment.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2025 Anthony Harivel + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +struct ip_fragment_trace_data { + uint16_t packet_id; + uint16_t frag_num; + uint16_t offset; + uint8_t more_frags; +}; + +enum { + IP_OUTPUT = 0, + NO_MBUF, + ALREADY_FRAGMENTED, + ERROR, + EDGE_COUNT, +}; + +static uint16_t +ip_fragment_process(struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs) { + struct 
rte_mbuf *mbuf, *frag_mbuf; + struct rte_ipv4_hdr *ip, *frag_ip; + uint16_t frag_size, frag_data_len; + uint16_t data_len, offset; + const struct iface *iface; + uint16_t num_frags, i; + uint16_t ip_hdr_len; + uint16_t sent = 0; + rte_edge_t edge; + void *payload; + + for (uint16_t j = 0; j < nb_objs; j++) { + mbuf = objs[j]; + ip = rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *); + + // Check if packet is already a fragment (MF flag set or non-zero fragment offset) + if (ip->fragment_offset + & RTE_BE16(RTE_IPV4_HDR_MF_FLAG | RTE_IPV4_HDR_OFFSET_MASK)) { + // Re-fragmenting an existing fragment is not handled by this node: drop it + edge = ALREADY_FRAGMENTED; + goto drop; + } + + iface = mbuf_data(mbuf)->iface; + assert(iface != NULL); + + ip_hdr_len = rte_ipv4_hdr_len(ip); + data_len = rte_be_to_cpu_16(ip->total_length) - ip_hdr_len; + + // Calculate fragment payload size (multiple of 8, >= 8) + uint16_t max_payload = (uint16_t)(iface->mtu - ip_hdr_len); + frag_size = RTE_ALIGN_FLOOR(max_payload, 8); + if (unlikely(frag_size < 8)) { + edge = ERROR; + goto drop; + } + + num_frags = (data_len + frag_size - 1) / frag_size; + assert(num_frags > 1); + + // Prepare and enqueue first fragment (using original mbuf) + ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_size); + ip->fragment_offset = RTE_BE16(RTE_IPV4_HDR_MF_FLAG); + ip->hdr_checksum = 0; + ip->hdr_checksum = rte_ipv4_cksum(ip); + + if (gr_mbuf_is_traced(mbuf)) { + struct ip_fragment_trace_data *t; + t = gr_mbuf_trace_add(mbuf, node, sizeof(*t)); + t->packet_id = rte_be_to_cpu_16(ip->packet_id); + t->frag_num = 0; + t->offset = 0; + t->more_frags = 1; + } + + // Enqueue first fragment + rte_node_enqueue_x1(graph, node, IP_OUTPUT, mbuf); + sent++; + + // Create and enqueue remaining fragments + for (i = 1; i < num_frags; i++) { + // Create new fragment, copying the original IPv4 header. 
+ frag_mbuf = rte_pktmbuf_copy(mbuf, mbuf->pool, 0, ip_hdr_len); + if (unlikely(frag_mbuf == NULL)) { + break; + } + + frag_ip = rte_pktmbuf_mtod(frag_mbuf, struct rte_ipv4_hdr *); + offset = i * frag_size; + frag_data_len = RTE_MIN(frag_size, data_len - offset); + + payload = rte_pktmbuf_append(frag_mbuf, frag_data_len); + if (unlikely(payload == NULL)) { + rte_pktmbuf_free(frag_mbuf); + break; + } + + memcpy(payload, + rte_pktmbuf_mtod_offset(mbuf, const void *, ip_hdr_len + offset), + frag_data_len); + + frag_ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_data_len); + frag_ip->fragment_offset = rte_cpu_to_be_16( + (offset / 8) | ((i < num_frags - 1) ? RTE_IPV4_HDR_MF_FLAG : 0) + ); + frag_ip->hdr_checksum = 0; + frag_ip->hdr_checksum = rte_ipv4_cksum(frag_ip); + + *ip_output_mbuf_data(frag_mbuf) = *ip_output_mbuf_data(mbuf); + frag_mbuf->packet_type = mbuf->packet_type; + if (gr_mbuf_is_traced(mbuf)) { + struct ip_fragment_trace_data *t; + t = gr_mbuf_trace_add(frag_mbuf, node, sizeof(*t)); + t->packet_id = rte_be_to_cpu_16(frag_ip->packet_id); + t->frag_num = i; + t->offset = offset; + t->more_frags = (i < num_frags - 1) ? 1 : 0; + } + + rte_node_enqueue_x1(graph, node, IP_OUTPUT, frag_mbuf); + sent++; + } + + // Trim first fragment to the right size + rte_pktmbuf_trim(mbuf, data_len - frag_size); + + continue; + +drop: + rte_node_enqueue_x1(graph, node, edge, mbuf); + sent++; + } + + return sent; +} + +static int ip_fragment_trace_format(char *buf, size_t len, const void *data, size_t /*data_len*/) { + const struct ip_fragment_trace_data *t = data; + return snprintf( + buf, + len, + "id=%u frag=%u offset=%u%s", + t->packet_id, + t->frag_num, + t->offset, + t->more_frags ? 
" MF" : "" + ); +} + +static struct rte_node_register fragment_node = { + .name = "ip_fragment", + .process = ip_fragment_process, + .nb_edges = EDGE_COUNT, + .next_nodes = { + [IP_OUTPUT] = "ip_output", + [NO_MBUF] = "error_no_headroom", + [ALREADY_FRAGMENTED] = "ip_fragment_already_fragmented", + [ERROR] = "ip_fragment_error" + }, +}; + +static struct gr_node_info info = { + .node = &fragment_node, + .trace_format = ip_fragment_trace_format, +}; + +GR_NODE_REGISTER(info); + +GR_DROP_REGISTER(ip_fragment_error); +GR_DROP_REGISTER(ip_fragment_already_fragmented); diff --git a/modules/ip/datapath/ip_output.c b/modules/ip/datapath/ip_output.c index 8a5d21db..36e06f50 100644 --- a/modules/ip/datapath/ip_output.c +++ b/modules/ip/datapath/ip_output.c @@ -26,7 +26,7 @@ enum { HOLD, NO_ROUTE, ERROR, - TOO_BIG, + FRAGMENT, FRAG_NEEDED, DROP, EDGE_COUNT, @@ -89,11 +89,13 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u goto next; } + mbuf_data(mbuf)->iface = iface; + if (rte_pktmbuf_pkt_len(mbuf) > iface->mtu) { if (ip->fragment_offset & rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG)) { edge = FRAG_NEEDED; } else { - edge = TOO_BIG; + edge = FRAGMENT; } goto next; } @@ -101,7 +103,6 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u // Determine what is the next node based on the output interface type // By default, it will be eth_output unless another output node was registered. 
edge = iface_type_edges[iface->type]; - mbuf_data(mbuf)->iface = iface; switch (snat44_process(iface, mbuf)) { case NAT_VERDICT_CONTINUE: @@ -156,7 +157,7 @@ static struct rte_node_register output_node = { [HOLD] = "ip_hold", [NO_ROUTE] = "ip_error_dest_unreach", [ERROR] = "ip_output_error", - [TOO_BIG] = "ip_output_too_big", + [FRAGMENT] = "ip_fragment", [FRAG_NEEDED] = "ip_error_frag_needed", [DROP] = "ip_output_drop", }, @@ -170,5 +171,4 @@ static struct gr_node_info info = { GR_NODE_REGISTER(info); GR_DROP_REGISTER(ip_output_error); -GR_DROP_REGISTER(ip_output_too_big); GR_DROP_REGISTER(ip_output_drop); diff --git a/modules/ip/datapath/meson.build b/modules/ip/datapath/meson.build index 1df71634..cade60ad 100644 --- a/modules/ip/datapath/meson.build +++ b/modules/ip/datapath/meson.build @@ -13,6 +13,7 @@ src += files( 'icmp_output.c', 'ip_error.c', 'ip_forward.c', + 'ip_fragment.c', 'ip_hold.c', 'ip_input.c', 'ip_local.c', From 813af553f2bd208db6a39c89ac77b3e0368078b1 Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Mon, 20 Oct 2025 11:28:47 +0200 Subject: [PATCH 3/3] smoke: add IPv4 fragmentation test Add a smoke test that verifies IPv4 packet fragmentation behavior by configuring asymmetric MTUs between two TAP interfaces (1500 and 1280 bytes) and sending packets through grout. The test validates three scenarios: normal forwarding without fragmentation, ICMP error generation when a large packet has the DF flag set, and successful fragmentation when the DF flag is not set. This ensures both the fragmentation logic and the DF flag handling work correctly. 
Signed-off-by: Anthony Harivel
Reviewed-by: Christophe Fontaine
---
 smoke/ip_fragment_test.sh | 65 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100755 smoke/ip_fragment_test.sh

diff --git a/smoke/ip_fragment_test.sh b/smoke/ip_fragment_test.sh
new file mode 100755
index 00000000..827092b8
--- /dev/null
+++ b/smoke/ip_fragment_test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2025 Anthony Harivel
+# Test IPv4 fragmentation
+#
+# grout forwards between two TAP ports whose kernel ends live in separate
+# network namespaces. The egress port (p1) gets an MTU of 1280 in grout while
+# the kernel side of both TAPs is set to 1500, so a 1288-byte packet leaves the
+# sender intact and has to be rejected (DF set) or fragmented (DF clear).
+
+. "$(dirname "$0")/_init.sh"
+
+p0=${run_id}0
+p1=${run_id}1
+
+grcli interface add port "$p0" devargs "net_tap0,iface=$p0" mac f0:0d:ac:dc:00:00
+# Set smaller MTU on p1 (egress) to force fragmentation
+grcli interface add port "$p1" devargs "net_tap1,iface=$p1" mac f0:0d:ac:dc:00:01 mtu 1280
+grcli address add 172.16.0.1/24 iface "$p0"
+grcli address add 172.16.1.1/24 iface "$p1"
+
+for n in 0 1; do
+	p=$run_id$n
+	netns_add "$p"
+	ip link set "$p" mtu 1500
+	ip link set "$p" netns "$p"
+	ip -n "$p" link set "$p" address "ba:d0:ca:ca:00:0$n"
+	ip -n "$p" link set "$p" up
+	ip -n "$p" link set lo up
+	ip -n "$p" addr add "172.16.$n.2/24" dev "$p"
+	ip -n "$p" route add default via "172.16.$n.1"
+	# Clear PMTU cache to ensure kernel uses interface MTU
+	ip -n "$p" route flush cache
+done
+
+# Test 1: Ping with default packet size (should work without fragmentation)
+ip netns exec "$p0" ping -i0.01 -c3 -n 172.16.1.2
+
+# Test 2: Large packet with DF flag set (should get ICMP fragmentation needed error)
+# Send a 1260-byte ICMP payload with DF=1 (Don't Fragment)
+# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes
+# Fits in p0 MTU (1500) but exceeds p1 MTU (1280)
+# Expected: ICMP Type 3 Code 4 (Fragmentation Needed and DF Set)
+# A silently dropped packet also makes ping fail, so the exit status alone does
+# not prove that the ICMP error was sent: the ping output is checked as well.
+# LC_ALL=C keeps the messages untranslated.
+if out=$(LC_ALL=C ip netns exec "$p0" ping -i0.01 -c3 -s 1260 -M do -n 172.16.1.2 2>&1); then
+	fail "ping with DF flag should have failed"
+fi
+printf '%s\n' "$out"
+# NOTE(review): the strings below are the ones printed by iputils ping ("Frag
+# needed and DF set" for the received ICMP error, "message too long" once the
+# kernel has cached the reduced path MTU) - confirm against the ping used in CI.
+grep -Eqi 'frag needed|message too long' <<<"$out" \
+	|| fail "ping did not report an ICMP fragmentation needed error"
+
+# Test 3: Large packet without DF flag (should fragment and succeed)
+# Send a 1260-byte ICMP payload with DF=0 (fragmentation allowed)
+# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes
+# Fits in p0 MTU (1500) but needs fragmentation for p1 MTU (1280)
+# Expected: Packet is fragmented into 2 fragments (1276 + 32 bytes) and ping succeeds
+# NOTE(review): presumably the cache is flushed to forget the path MTU learned
+# in Test 2, so that the sender does not fragment the packet itself - confirm.
+ip netns exec "$p0" ip route flush cache
+ip netns exec "$p0" ping -i0.01 -c3 -s 1260 -M dont -n 172.16.1.2