Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
786 changes: 408 additions & 378 deletions docs/graph.svg
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it weird to have the node ip_error_frag_needed without any link to icmp_output?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea. I guess it's like "ip_error_ttl_exceeded", which doesn't have any link to "icmp_output" either. Maybe a bug in the graph generation?

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions modules/ip/datapath/ip_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ static int no_route_init(const struct rte_graph *, struct rte_node *node) {
return 0;
}

// Node init callback for ip_error_frag_needed: store the ICMP type/code pair
// (destination unreachable / fragmentation needed) in the node context.
// Always returns 0.
static int frag_needed_init(const struct rte_graph *, struct rte_node *node) {
	struct ip_error_ctx *err = ip_error_ctx(node);

	err->icmp_code = RTE_ICMP_CODE_UNREACH_FRAG;
	err->icmp_type = RTE_ICMP_TYPE_DEST_UNREACHABLE;

	return 0;
}

static struct rte_node_register ip_forward_ttl_exceeded_node = {
.name = "ip_error_ttl_exceeded",
.process = ip_error_process,
Expand All @@ -134,6 +141,18 @@ static struct rte_node_register no_route_node = {
.init = no_route_init,
};

// Graph node "ip_error_frag_needed": uses the shared ip_error_process handler
// and is configured by frag_needed_init.
static struct rte_node_register frag_needed_node = {
	.name = "ip_error_frag_needed",
	.init = frag_needed_init,
	.process = ip_error_process,
	.nb_edges = EDGE_COUNT,
	.next_nodes = {
		[NO_IP] = "error_no_local_ip",
		[NO_HEADROOM] = "error_no_headroom",
		[ICMP_OUTPUT] = "icmp_output",
	},
};

// Registration record for the ip_error_ttl_exceeded node.
static struct gr_node_info info_ttl_exceeded = {.node = &ip_forward_ttl_exceeded_node};
Expand All @@ -142,7 +161,12 @@ static struct gr_node_info info_no_route = {
.node = &no_route_node,
};

// Registration record for the ip_error_frag_needed node.
static struct gr_node_info info_frag_needed = {.node = &frag_needed_node};

// Register the three ip_error_* nodes described by the info records above.
GR_NODE_REGISTER(info_ttl_exceeded);
GR_NODE_REGISTER(info_no_route);
GR_NODE_REGISTER(info_frag_needed);

// "error_no_local_ip" is the target of the NO_IP edge of the ip_error nodes.
// NOTE(review): presumably GR_DROP_REGISTER declares a drop node of that name - confirm.
GR_DROP_REGISTER(error_no_local_ip);
186 changes: 186 additions & 0 deletions modules/ip/datapath/ip_fragment.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// SPDX-License-Identifier: BSD-3-Clause
// Copyright (c) 2025 Anthony Harivel

#include <gr_datapath.h>
#include <gr_graph.h>
#include <gr_iface.h>
#include <gr_ip4_datapath.h>
#include <gr_log.h>
#include <gr_mbuf.h>
#include <gr_trace.h>

#include <rte_byteorder.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Per-fragment trace record filled in by ip_fragment_process and rendered by
// ip_fragment_trace_format.
struct ip_fragment_trace_data {
// IPv4 identification field, converted to host byte order.
uint16_t packet_id;
// Index of the fragment within the original datagram (0 for the first one).
uint16_t frag_num;
// Byte offset of the fragment payload within the original payload.
uint16_t offset;
// 1 when more fragments follow this one, 0 for the last fragment.
uint8_t more_frags;
};

// Outgoing edges of the ip_fragment node. The values index the next_nodes
// table of the node registration, so their order must stay in sync with it.
enum {
// Fragments ready to be sent.
IP_OUTPUT = 0,
// Declared in the edge table but never selected by ip_fragment_process.
NO_MBUF,
// Packet already has MF set or a non-zero fragment offset.
ALREADY_FRAGMENTED,
// Egress MTU leaves less than 8 bytes of payload per fragment.
ERROR,
// Number of edges; not an edge itself.
EDGE_COUNT,
};

// Split oversized IPv4 packets into fragments that fit the egress MTU.
//
// For every mbuf in objs:
//  - packets that already have MF set or a non-zero fragment offset go to the
//    ALREADY_FRAGMENTED edge;
//  - packets whose egress MTU cannot hold the IP header plus at least 8 bytes
//    of payload go to the ERROR edge;
//  - otherwise the original mbuf becomes the first fragment and every
//    following fragment is a new mbuf (copy of the IP header + payload slice),
//    all enqueued to IP_OUTPUT.
//
// The egress interface is read from mbuf_data(mbuf)->iface and must be set by
// the caller. Returns the number of mbufs enqueued to next nodes, generated
// fragments included.
static uint16_t
ip_fragment_process(struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs) {
	struct rte_mbuf *mbuf, *frag_mbuf;
	struct rte_ipv4_hdr *ip, *frag_ip;
	uint16_t frag_size, frag_data_len;
	uint16_t data_len, offset;
	const struct iface *iface;
	uint16_t num_frags, i;
	uint16_t ip_hdr_len;
	uint16_t sent = 0;
	rte_edge_t edge;
	void *payload;

	for (uint16_t j = 0; j < nb_objs; j++) {
		mbuf = objs[j];
		ip = rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *);

		// Re-fragmenting an existing fragment is not supported: a packet with
		// MF set or a non-zero offset is sent to its dedicated drop node.
		if (ip->fragment_offset
		    & RTE_BE16(RTE_IPV4_HDR_MF_FLAG | RTE_IPV4_HDR_OFFSET_MASK)) {
			edge = ALREADY_FRAGMENTED;
			goto drop;
		}

		iface = mbuf_data(mbuf)->iface;
		assert(iface != NULL);

		ip_hdr_len = rte_ipv4_hdr_len(ip);
		data_len = rte_be_to_cpu_16(ip->total_length) - ip_hdr_len;

		// An MTU that cannot even hold the IP header would make the
		// subtraction below wrap around to a huge, bogus payload size.
		if (unlikely(iface->mtu <= ip_hdr_len)) {
			edge = ERROR;
			goto drop;
		}

		// Calculate fragment payload size (multiple of 8, >= 8)
		uint16_t max_payload = (uint16_t)(iface->mtu - ip_hdr_len);
		frag_size = RTE_ALIGN_FLOOR(max_payload, 8);
		if (unlikely(frag_size < 8)) {
			edge = ERROR;
			goto drop;
		}

		// Callers only send packets that do not fit the MTU, so at least two
		// fragments are expected.
		num_frags = (data_len + frag_size - 1) / frag_size;
		assert(num_frags > 1);

		// Prepare the first fragment in place (using the original mbuf). Only
		// the header is rewritten here: the mbuf is trimmed after the loop
		// because the other fragments copy their payload out of it.
		ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_size);
		ip->fragment_offset = RTE_BE16(RTE_IPV4_HDR_MF_FLAG);
		ip->hdr_checksum = 0;
		ip->hdr_checksum = rte_ipv4_cksum(ip);

		if (gr_mbuf_is_traced(mbuf)) {
			struct ip_fragment_trace_data *t;
			t = gr_mbuf_trace_add(mbuf, node, sizeof(*t));
			t->packet_id = rte_be_to_cpu_16(ip->packet_id);
			t->frag_num = 0;
			t->offset = 0;
			t->more_frags = 1;
		}

		// Enqueue first fragment
		rte_node_enqueue_x1(graph, node, IP_OUTPUT, mbuf);
		sent++;

		// Create and enqueue remaining fragments.
		// NOTE(review): if an allocation fails below, the fragments already
		// enqueued are still sent and the rest of the datagram is silently
		// lost; the NO_MBUF edge is never used. Consider accounting for it.
		for (i = 1; i < num_frags; i++) {
			// Create new fragment, copying the original IPv4 header.
			frag_mbuf = rte_pktmbuf_copy(mbuf, mbuf->pool, 0, ip_hdr_len);
			if (unlikely(frag_mbuf == NULL)) {
				break;
			}

			frag_ip = rte_pktmbuf_mtod(frag_mbuf, struct rte_ipv4_hdr *);
			offset = i * frag_size;
			frag_data_len = RTE_MIN(frag_size, data_len - offset);

			payload = rte_pktmbuf_append(frag_mbuf, frag_data_len);
			if (unlikely(payload == NULL)) {
				rte_pktmbuf_free(frag_mbuf);
				break;
			}

			// NOTE(review): assumes the whole packet is contiguous in the
			// first segment of mbuf - confirm for chained mbufs.
			memcpy(payload,
			       rte_pktmbuf_mtod_offset(mbuf, const void *, ip_hdr_len + offset),
			       frag_data_len);

			// The offset field is expressed in units of 8 bytes; every
			// fragment but the last one carries the MF flag.
			frag_ip->total_length = rte_cpu_to_be_16(ip_hdr_len + frag_data_len);
			frag_ip->fragment_offset = rte_cpu_to_be_16(
				(offset / 8) | ((i < num_frags - 1) ? RTE_IPV4_HDR_MF_FLAG : 0)
			);
			frag_ip->hdr_checksum = 0;
			frag_ip->hdr_checksum = rte_ipv4_cksum(frag_ip);

			// Propagate the output metadata of the original packet.
			*ip_output_mbuf_data(frag_mbuf) = *ip_output_mbuf_data(mbuf);
			frag_mbuf->packet_type = mbuf->packet_type;
			if (gr_mbuf_is_traced(mbuf)) {
				struct ip_fragment_trace_data *t;
				t = gr_mbuf_trace_add(frag_mbuf, node, sizeof(*t));
				t->packet_id = rte_be_to_cpu_16(frag_ip->packet_id);
				t->frag_num = i;
				t->offset = offset;
				t->more_frags = (i < num_frags - 1) ? 1 : 0;
			}

			rte_node_enqueue_x1(graph, node, IP_OUTPUT, frag_mbuf);
			sent++;
		}

		// Trim first fragment to the right size
		rte_pktmbuf_trim(mbuf, data_len - frag_size);

		continue;

drop:
		rte_node_enqueue_x1(graph, node, edge, mbuf);
		sent++;
	}

	return sent;
}

// Render one ip_fragment trace record into buf (at most len bytes) as
// "id=<id> frag=<index> offset=<bytes>", followed by " MF" when more fragments
// follow. Returns the snprintf() result.
static int ip_fragment_trace_format(char *buf, size_t len, const void *data, size_t /*data_len*/) {
	const struct ip_fragment_trace_data *trace = data;
	const char *mf_suffix = "";

	if (trace->more_frags) {
		mf_suffix = " MF";
	}

	return snprintf(
		buf,
		len,
		"id=%u frag=%u offset=%u%s",
		trace->packet_id,
		trace->frag_num,
		trace->offset,
		mf_suffix
	);
}

static struct rte_node_register fragment_node = {
.name = "ip_fragment",
.process = ip_fragment_process,
.nb_edges = EDGE_COUNT,
.next_nodes = {
[IP_OUTPUT] = "ip_output",
[NO_MBUF] = "error_no_headroom",
[ALREADY_FRAGMENTED] = "ip_fragment_already_fragmented",
[ERROR] = "ip_fragment_error"
},
};

// Registration record for the ip_fragment node and its trace formatter.
static struct gr_node_info info = {
	.trace_format = ip_fragment_trace_format,
	.node = &fragment_node,
};

// Register the ip_fragment node described by the info record above.
GR_NODE_REGISTER(info);

// Targets of the ERROR and ALREADY_FRAGMENTED edges of the ip_fragment node.
// NOTE(review): presumably GR_DROP_REGISTER declares drop nodes of these names - confirm.
GR_DROP_REGISTER(ip_fragment_error);
GR_DROP_REGISTER(ip_fragment_already_fragmented);
16 changes: 11 additions & 5 deletions modules/ip/datapath/ip_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ enum {
HOLD,
NO_ROUTE,
ERROR,
TOO_BIG,
FRAGMENT,
FRAG_NEEDED,
DROP,
EDGE_COUNT,
};
Expand Down Expand Up @@ -88,15 +89,20 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u
goto next;
}

mbuf_data(mbuf)->iface = iface;

if (rte_pktmbuf_pkt_len(mbuf) > iface->mtu) {
edge = TOO_BIG;
if (ip->fragment_offset & rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG)) {
edge = FRAG_NEEDED;
} else {
edge = FRAGMENT;
}
goto next;
}

// Determine what is the next node based on the output interface type
// By default, it will be eth_output unless another output node was registered.
edge = iface_type_edges[iface->type];
mbuf_data(mbuf)->iface = iface;

switch (snat44_process(iface, mbuf)) {
case NAT_VERDICT_CONTINUE:
Expand Down Expand Up @@ -151,7 +157,8 @@ static struct rte_node_register output_node = {
[HOLD] = "ip_hold",
[NO_ROUTE] = "ip_error_dest_unreach",
[ERROR] = "ip_output_error",
[TOO_BIG] = "ip_output_too_big",
[FRAGMENT] = "ip_fragment",
[FRAG_NEEDED] = "ip_error_frag_needed",
[DROP] = "ip_output_drop",
},
};
Expand All @@ -164,5 +171,4 @@ static struct gr_node_info info = {
GR_NODE_REGISTER(info);

GR_DROP_REGISTER(ip_output_error);
GR_DROP_REGISTER(ip_output_too_big);
GR_DROP_REGISTER(ip_output_drop);
1 change: 1 addition & 0 deletions modules/ip/datapath/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ src += files(
'icmp_output.c',
'ip_error.c',
'ip_forward.c',
'ip_fragment.c',
'ip_hold.c',
'ip_input.c',
'ip_local.c',
Expand Down
47 changes: 47 additions & 0 deletions smoke/ip_fragment_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 Anthony Harivel
# Test IPv4 fragmentation

# Quoted so that a checkout path containing spaces or glob characters still
# resolves to the right _init.sh.
. "$(dirname "$0")/_init.sh"

p0=${run_id}0
p1=${run_id}1

grcli interface add port $p0 devargs net_tap0,iface=$p0 mac f0:0d:ac:dc:00:00
# Set smaller MTU on p1 (egress) to force fragmentation
grcli interface add port $p1 devargs net_tap1,iface=$p1 mac f0:0d:ac:dc:00:01 mtu 1280
grcli address add 172.16.0.1/24 iface $p0
grcli address add 172.16.1.1/24 iface $p1

for n in 0 1; do
p=$run_id$n
netns_add $p
ip link set $p mtu 1500
ip link set $p netns $p
ip -n $p link set $p address ba:d0:ca:ca:00:0$n
ip -n $p link set $p up
ip -n $p link set lo up
ip -n $p addr add 172.16.$n.2/24 dev $p
ip -n $p route add default via 172.16.$n.1
# Clear PMTU cache to ensure kernel uses interface MTU
ip -n $p route flush cache
done

# Test 1: Ping with default packet size (should work without fragmentation)
ip netns exec $p0 ping -i0.01 -c3 -n 172.16.1.2

# Test 2: Large packet with DF flag set (should get ICMP fragmentation needed error)
# Send 1260-byte packet with DF=1 (Don't Fragment)
# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes
# Fits in p0 MTU (1500) but exceeds p1 MTU (1280)
# Expected: ICMP Type 3 Code 4 (Fragmentation Needed and DF Set)
ip netns exec $p0 ping -i0.01 -c3 -s 1260 -M do -n 172.16.1.2 && fail "ping with DF flag should have failed"

# Test 3: Large packet without DF flag (should fragment and succeed)
# Send 1260-byte packet with DF=0 (fragmentation allowed)
# Packet size: 1260 + 8 (ICMP) + 20 (IP) = 1288 bytes
# Fits in p0 MTU (1500) but needs fragmentation for p1 MTU (1280)
# Expected: Packet is fragmented into 2 fragments (1276 + 32 bytes) and ping succeeds
ip netns exec $p0 ip route flush cache
ip netns exec $p0 ping -i0.01 -c3 -s 1260 -M dont -n 172.16.1.2