diff --git a/docs/graph.svg b/docs/graph.svg
index 8e1152b13..da536644b 100644
--- a/docs/graph.svg
+++ b/docs/graph.svg
@@ -4,880 +4,916 @@
 [SVG markup elided: docs/graph.svg is regenerated by graphviz. The only
 functional change is the addition of the ip_loadbalance and ip6_loadbalance
 nodes to the packet graph, with the edges ip_output -> ip_loadbalance ->
 ip_output and ip6_output -> ip6_loadbalance -> ip6_output. All other nodes
 and edges are unchanged apart from re-rendered coordinates.]
diff --git a/frr/rt_grout.c b/frr/rt_grout.c
index 5b337aeda..e26454331 100644
--- a/frr/rt_grout.c
+++ b/frr/rt_grout.c
@@ -289,6 +289,12 @@ static int grout_gr_nexthop_to_frr_nexthop(
 		nexthop_add_srv6_seg6(nh, (void *)sr6->seglist, sr6->n_seglist, encap_behavior);
 		break;
 	}
+	case GR_NH_T_GROUP:
+		nh->ifindex = gr_nh->iface_id;
+		nh->vrf_id = gr_nh->vrf_id;
+		*nh_family = AF_UNSPEC;
+		nh->weight = 1;
+		break;
 	default:
 		gr_log_err(
 			"sync %s nexthops from grout not supported", gr_nh_type_name(gr_nh->type)
@@ -572,12 +578,44 @@ enum zebra_dplane_result grout_add_del_route(struct zebra_dplane_ctx *ctx) {
 	return ZEBRA_DPLANE_REQUEST_SUCCESS;
 }
 
+static enum zebra_dplane_result grout_add_nexthop_group(struct zebra_dplane_ctx *ctx) {
+	enum zebra_dplane_result ret = ZEBRA_DPLANE_REQUEST_SUCCESS;
+	uint32_t nh_id = dplane_ctx_get_nhe_id(ctx);
+	struct gr_nexthop_info_group *group;
+	struct gr_nh_add_req *req = NULL;
+	size_t len;
+
+	len = sizeof(*req) + sizeof(*group)
+		+ dplane_ctx_get_nhe_nh_grp_count(ctx) * sizeof(group->members[0]);
+	if ((req = calloc(1, len)) == NULL)
+		return ZEBRA_DPLANE_REQUEST_FAILURE;
+
+	group = (struct gr_nexthop_info_group *)req->nh.info;
+	group->n_members = dplane_ctx_get_nhe_nh_grp_count(ctx);
+
+	req->exist_ok = true;
+	req->nh.nh_id = nh_id;
+	req->nh.type = GR_NH_T_GROUP;
+	req->nh.origin = zebra2origin(dplane_ctx_get_nhe_type(ctx));
+
+	const struct nh_grp *nhs = dplane_ctx_get_nhe_nh_grp(ctx);
+	for (size_t i = 0; i < group->n_members; i++) {
+		group->members[i].nh_id = nhs[i].id;
+		group->members[i].weight = nhs[i].weight;
+	}
+
+	if (grout_client_send_recv(GR_NH_ADD, len, req, NULL) < 0)
+		ret = ZEBRA_DPLANE_REQUEST_FAILURE;
+
+	free(req);
+	return ret;
+}
+
 static enum zebra_dplane_result grout_del_nexthop(uint32_t nh_id) {
 	struct gr_nh_del_req req = {.missing_ok = true, .nh_id = nh_id};
 
 	if (grout_client_send_recv(GR_NH_DEL, sizeof(req), &req, NULL) < 0)
 		return ZEBRA_DPLANE_REQUEST_FAILURE;
-
 	return ZEBRA_DPLANE_REQUEST_SUCCESS;
 }
 
@@ -762,15 +800,12 @@ enum zebra_dplane_result grout_add_del_nexthop(struct zebra_dplane_ctx *ctx) {
 		return ZEBRA_DPLANE_REQUEST_FAILURE;
 	}
 
-	if (dplane_ctx_get_nhe_nh_grp_count(ctx)) {
-		// nexthop groups are not supported in grout
-		gr_log_err("impossible to add/del nexthop grout %u (nhg not supported)", nh_id);
-		return ZEBRA_DPLANE_REQUEST_FAILURE;
-	}
-
 	if (dplane_ctx_get_op(ctx) == DPLANE_OP_NH_DELETE)
 		return grout_del_nexthop(nh_id);
 
+	if (dplane_ctx_get_nhe_nh_grp_count(ctx))
+		return grout_add_nexthop_group(ctx);
+
 	return grout_add_nexthop(nh_id, origin, dplane_ctx_get_nhe_ng(ctx)->nexthop);
 }
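Note: with this change, an ECMP route computed by zebra is no longer rejected by the grout dplane plugin; the nexthop group is translated into a single GR_NH_ADD request of type GR_NH_T_GROUP whose members reference the individual nexthops by id. A minimal sketch of an FRR configuration that exercises grout_add_nexthop_group(), reusing the staticd syntax from the smoke test added below, would be:

    vtysh <<-EOF
    	configure terminal
    	ip route 192.0.0.0/24 172.16.1.2
    	ip route 192.0.0.0/24 172.16.2.2
    EOF

zebra then resolves the two gateways into a nexthop group and hands it to the plugin as a nexthop dplane operation with a non-zero dplane_ctx_get_nhe_nh_grp_count().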
diff --git a/modules/infra/api/gr_nexthop.h b/modules/infra/api/gr_nexthop.h
index 4f429bec9..bbe93fd0e 100644
--- a/modules/infra/api/gr_nexthop.h
+++ b/modules/infra/api/gr_nexthop.h
@@ -32,6 +32,7 @@ typedef enum : uint8_t {
 	GR_NH_T_DNAT,
 	GR_NH_T_BLACKHOLE,
 	GR_NH_T_REJECT,
+	GR_NH_T_GROUP, // ECMP
 #define GR_NH_T_ALL UINT8_C(0xff)
 } gr_nh_type_t;
 
@@ -99,6 +100,17 @@ struct gr_nexthop_info_l3 {
 	struct rte_ether_addr mac; //!< link-layer address
 };
 
+// Info for GR_NH_T_GROUP nexthops
+struct gr_nexthop_group_member {
+	uint32_t nh_id;
+	uint32_t weight;
+};
+
+struct gr_nexthop_info_group {
+	uint32_t n_members;
+	struct gr_nexthop_group_member members[];
+};
+
 //! Nexthop structure exposed to the API.
 struct gr_nexthop {
 	BASE(gr_nexthop_base);
@@ -166,6 +178,8 @@ static inline const char *gr_nh_type_name(const gr_nh_type_t type) {
 		return "blackhole";
 	case GR_NH_T_REJECT:
 		return "reject";
+	case GR_NH_T_GROUP:
+		return "group";
 	}
 	return "?";
 }
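For API clients, the group payload follows the generic request header and is sized through the flexible array member, exactly as the CLI and the FRR plugin do. A minimal sketch (error handling elided; `client` stands for an already connected gr_api_client and is only an assumption of this example):

    struct gr_nexthop_info_group *grp;
    struct gr_nh_add_req *req;
    size_t len = sizeof(*req) + sizeof(*grp) + 2 * sizeof(grp->members[0]);

    req = calloc(1, len);
    req->exist_ok = true;
    req->nh.nh_id = 10;                 // id of the group nexthop itself
    req->nh.type = GR_NH_T_GROUP;
    req->nh.origin = GR_NH_ORIGIN_USER;

    grp = (struct gr_nexthop_info_group *)req->nh.info;
    grp->n_members = 2;
    grp->members[0] = (struct gr_nexthop_group_member){.nh_id = 100, .weight = 1};
    grp->members[1] = (struct gr_nexthop_group_member){.nh_id = 101, .weight = 1};

    gr_api_client_send_recv(client, GR_NH_ADD, len, req, NULL);
    free(req);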
diff --git a/modules/infra/cli/nexthop.c b/modules/infra/cli/nexthop.c
index a4194437e..8917ee097 100644
--- a/modules/infra/cli/nexthop.c
+++ b/modules/infra/cli/nexthop.c
@@ -111,6 +111,19 @@ static ssize_t format_nexthop_info_void(char *, size_t, const void *) {
 	return 0;
 }
 
+static ssize_t format_nexthop_info_group(char *buf, size_t len, const void *info) {
+	const struct gr_nexthop_info_group *grp = info;
+	ssize_t n = 0;
+
+	for (uint32_t i = 0; i < grp->n_members; i++)
+		SAFE_BUF(
+			snprintf, len, "id(%u/%u) ", grp->members[i].nh_id, grp->members[i].weight
+		);
+	return n;
+err:
+	return -errno;
+}
+
 static struct cli_nexthop_formatter blackhole_formatter = {
 	.name = "blackhole",
 	.type = GR_NH_T_BLACKHOLE,
@@ -123,6 +136,12 @@ static struct cli_nexthop_formatter reject_formatter = {
 	.format = format_nexthop_info_void,
 };
 
+static struct cli_nexthop_formatter group_formatter = {
+	.name = "group",
+	.type = GR_NH_T_GROUP,
+	.format = format_nexthop_info_group,
+};
+
 static int complete_nh_types(
 	struct gr_api_client *,
 	const struct ec_node *node,
@@ -284,6 +303,54 @@ static cmd_status_t nh_del(struct gr_api_client *c, const struct ec_pnode *p) {
 	return CMD_SUCCESS;
 }
 
+static cmd_status_t nh_group_add(struct gr_api_client *c, const struct ec_pnode *p) {
+	struct gr_nexthop_info_group *group;
+	struct gr_nh_add_req *req = NULL;
+	const struct ec_pnode *n = NULL;
+	cmd_status_t ret = CMD_ERROR;
+	uint32_t n_members = 0;
+	size_t len;
+
+	while ((n = ec_pnode_find_next(p, n, "MEMBER", false)) != NULL) {
+		n_members++;
+	}
+	n = NULL;
+
+	len = sizeof(*req) + sizeof(*group) + n_members * sizeof(group->members[0]);
+	if ((req = calloc(1, len)) == NULL) {
+		errno = ENOMEM;
+		goto out;
+	}
+
+	req->exist_ok = true;
+	req->nh.type = GR_NH_T_GROUP;
+	req->nh.origin = GR_NH_ORIGIN_USER;
+
+	if (arg_u32(p, "ID", &req->nh.nh_id) < 0 && errno != ENOENT)
+		goto out;
+
+	group = (struct gr_nexthop_info_group *)req->nh.info;
+
+	while ((n = ec_pnode_find_next(p, n, "MEMBER", false)) != NULL) {
+		if (arg_u32(n, "NHID", &group->members[group->n_members].nh_id) < 0)
+			goto out;
+		if (arg_u32(n, "WEIGHT", &group->members[group->n_members].weight) < 0) {
+			if (errno == ENOENT)
+				group->members[group->n_members].weight = 1;
+			else
+				goto out;
+		}
+		group->n_members++;
+	}
+
+	if (gr_api_client_send_recv(c, GR_NH_ADD, len, req, NULL) < 0)
+		goto out;
+	ret = CMD_SUCCESS;
+out:
+	free(req);
+	return ret;
+}
+
 static cmd_status_t nh_list(struct gr_api_client *c, const struct ec_pnode *p) {
 	struct gr_nh_list_req req = {.vrf_id = GR_VRF_ID_ALL, .type = GR_NH_T_ALL};
 	const struct gr_nexthop *nh;
@@ -414,6 +481,24 @@ static int ctx_init(struct ec_node *root) {
 		with_help("Blackhole nexthop.", ec_node_str("blackhole", "blackhole")),
 		with_help("Reject nexthop sending ICMP UNREACH.", ec_node_str("reject", "reject"))
 	);
+	if (ret < 0)
+		return ret;
+	ret = CLI_COMMAND(
+		NEXTHOP_ADD_CTX(root),
+		"group [(id ID)] (member MEMBER)*",
+		nh_group_add,
+		"Add a new nexthop group.",
+		with_help("Nexthop ID.", ec_node_uint("ID", 1, UINT32_MAX - 1, 10)),
+		with_help(
+			"Nexthop member ID with relative weight.",
+			EC_NODE_CMD(
+				"MEMBER",
+				"NHID [weight WEIGHT]",
+				ec_node_uint("NHID", 1, UINT32_MAX - 1, 10),
+				ec_node_uint("WEIGHT", 1, UINT32_MAX - 1, 10)
+			)
+		)
+	);
 	if (ret < 0)
 		return ret;
 	ret = CLI_COMMAND(
@@ -488,4 +573,5 @@ static void __attribute__((constructor, used)) init(void) {
 	cli_nexthop_formatter_register(&l3_formatter);
 	cli_nexthop_formatter_register(&blackhole_formatter);
 	cli_nexthop_formatter_register(&reject_formatter);
+	cli_nexthop_formatter_register(&group_formatter);
 }
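The resulting grammar takes one or more members, each with an optional weight defaulting to 1. For example (nexthop ids borrowed from the smoke test below, the explicit weight is only illustrative):

    grcli nexthop add l3 iface p0 address 172.16.0.2 id 100
    grcli nexthop add l3 iface p1 address 172.16.1.2 id 101
    grcli nexthop add group id 10 member 100 weight 2 member 101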
diff --git a/modules/infra/control/gr_nh_control.h b/modules/infra/control/gr_nh_control.h
index b0c48554a..ea98f3e6d 100644
--- a/modules/infra/control/gr_nh_control.h
+++ b/modules/infra/control/gr_nh_control.h
@@ -54,6 +54,16 @@ struct hoplist {
 	gr_vec struct nexthop **nh;
 };
 
+struct nh_group_member {
+	struct nexthop *nh;
+	uint32_t weight;
+};
+
+GR_NH_TYPE_INFO(GR_NH_T_GROUP, nexthop_info_group, {
+	uint32_t n_members;
+	struct nh_group_member *members;
+});
+
 // Lookup a nexthop from the global pool that matches the specified criteria.
 struct nexthop *
 nexthop_lookup(addr_family_t af, uint16_t vrf_id, uint16_t iface_id, const void *addr);
diff --git a/modules/infra/control/nexthop.c b/modules/infra/control/nexthop.c
index fbcb28355..0725cf7df 100644
--- a/modules/infra/control/nexthop.c
+++ b/modules/infra/control/nexthop.c
@@ -313,6 +313,7 @@ void nexthop_type_ops_register(gr_nh_type_t type, const struct nexthop_type_ops
 	case GR_NH_T_DNAT:
 	case GR_NH_T_BLACKHOLE:
 	case GR_NH_T_REJECT:
+	case GR_NH_T_GROUP:
 		if (ops == NULL)
 			ABORT("invalid type ops");
 		if (type_ops[type] != NULL)
@@ -335,6 +336,7 @@ struct nexthop *nexthop_new(const struct gr_nexthop_base *base, const void *info
 	case GR_NH_T_DNAT:
 	case GR_NH_T_BLACKHOLE:
 	case GR_NH_T_REJECT:
+	case GR_NH_T_GROUP:
 		break;
 	default:
 		ABORT("invalid nexthop type %hhu", base->type);
@@ -483,6 +485,28 @@ struct nexthop *nexthop_lookup_by_id(uint32_t nh_id) {
 	return data;
 }
 
+static void nh_groups_remove_member(const struct nexthop *nh) {
+	struct nexthop_info_group *info;
+	struct nexthop *group;
+	uint32_t next = 0;
+	const void *key;
+	void *data;
+
+	while (rte_hash_iterate(hash_by_id, &key, &data, &next) >= 0) {
+		group = data;
+		if (group->type != GR_NH_T_GROUP)
+			continue;
+		info = nexthop_info_group(group);
+		for (uint32_t i = 0; i < info->n_members; i++) {
+			if (info->members[i].nh == nh) {
+				info->members[i].nh = info->members[info->n_members - 1].nh;
+				info->members[i].weight = info->members[info->n_members - 1].weight;
+				info->n_members--;
+			}
+		}
+	}
+}
+
 void nexthop_routes_cleanup(struct nexthop *nh) {
 	const struct nexthop_af_ops *ops;
 	for (unsigned i = 0; i < ARRAY_DIM(af_ops); i++) {
@@ -513,8 +537,8 @@ void nexthop_decref(struct nexthop *nh) {
 		if (nh->origin != GR_NH_ORIGIN_INTERNAL)
 			gr_event_push(GR_EVENT_NEXTHOP_DELETE, nh);
 
+		nh_groups_remove_member(nh);
 		nexthop_id_put(nh);
-
 		rte_rcu_qsbr_synchronize(gr_datapath_rcu(), RTE_QSBR_THRID_INVALID);
 
 		assert(nh_stats.total > 0);
@@ -525,7 +549,6 @@ void nexthop_decref(struct nexthop *nh) {
 		const struct nexthop_type_ops *ops = type_ops[nh->type];
 		if (ops != NULL && ops->free != NULL)
 			ops->free(nh);
-
 		rte_mempool_put(pool, nh);
 	}
 }
@@ -795,6 +818,100 @@ static struct nexthop_type_ops l3_nh_ops = {
 	.to_api = l3_to_api,
 };
 
+static bool group_equal(const struct nexthop *a, const struct nexthop *b) {
+	const struct nexthop_info_group *da = nexthop_info_group(a);
+	const struct nexthop_info_group *db = nexthop_info_group(b);
+
+	if (da->n_members != db->n_members)
+		return false;
+	for (uint32_t i = 0; i < da->n_members; i++)
+		if (da->members[i].nh != db->members[i].nh
+		    || da->members[i].weight != db->members[i].weight)
+			return false;
+	return true;
+}
+
+static void group_free(struct nexthop *nh) {
+	struct nexthop_info_group *pvt = nexthop_info_group(nh);
+
+	for (uint32_t i = 0; i < pvt->n_members; i++)
+		nexthop_decref(pvt->members[i].nh);
+	rte_free(pvt->members);
+}
+
+static int group_import_info(struct nexthop *nh, const void *info) {
+	struct nexthop_info_group *pvt = nexthop_info_group(nh);
+	const struct gr_nexthop_info_group *group = info;
+	struct nh_group_member *members, *tmp;
+	uint32_t n_tmp;
+
+	members = rte_zmalloc(
+		__func__, group->n_members * sizeof(pvt->members[0]), RTE_CACHE_LINE_SIZE
+	);
+	if (group->n_members > 0 && members == NULL)
+		return errno_set(ENOMEM);
+
+	for (uint32_t i = 0; i < group->n_members; i++) {
+		struct nexthop *nh = nexthop_lookup_by_id(group->members[i].nh_id);
+		if (nh) {
+			members[i].nh = nh;
+			members[i].weight = group->members[i].weight;
+		} else {
+			rte_free(members);
+			return errno_set(ENOENT);
+		}
+	}
+
+	for (uint32_t i = 0; i < group->n_members; i++)
+		nexthop_incref(members[i].nh);
+
+	n_tmp = pvt->n_members;
+	tmp = pvt->members;
+	pvt->n_members = group->n_members;
+	pvt->members = members;
+
+	rte_rcu_qsbr_synchronize(gr_datapath_rcu(), RTE_QSBR_THRID_INVALID);
+
+	for (uint32_t i = 0; i < n_tmp; i++)
+		nexthop_decref(tmp[i].nh);
+
+	rte_free(tmp);
+
+	return 0;
+}
+
+static struct gr_nexthop *group_to_api(const struct nexthop *nh, size_t *len) {
+	const struct nexthop_info_group *group_priv = nexthop_info_group(nh);
+	struct gr_nexthop_info_group *group_pub;
+	struct gr_nexthop *pub;
+	*len = sizeof(*pub) + sizeof(*group_pub)
+		+ group_priv->n_members * sizeof(group_priv->members[0]);
+
+	pub = malloc(*len);
+	if (pub == NULL) {
+		*len = 0;
+		return errno_set_null(ENOMEM);
+	}
+
+	pub->base = nh->base;
+	group_pub = (struct gr_nexthop_info_group *)pub->info;
+
+	group_pub->n_members = group_priv->n_members;
+	for (uint32_t i = 0; i < group_pub->n_members; i++) {
+		group_pub->members[i].nh_id = group_priv->members[i].nh->nh_id;
+		group_pub->members[i].weight = group_priv->members[i].weight;
+	}
+
+	return pub;
+}
+
+static struct nexthop_type_ops group_nh_ops = {
+	.equal = group_equal,
+	.free = group_free,
+	.import_info = group_import_info,
+	.to_api = group_to_api,
+};
+
 RTE_INIT(init) {
 	gr_event_register_serializer(&nh_serializer);
 	gr_register_module(&module);
@@ -802,4 +919,5 @@ RTE_INIT(init) {
 		"/grout/nexthop/stats", telemetry_nexthop_stats_get, "Get nexthop statistics"
 	);
 	nexthop_type_ops_register(GR_NH_T_L3, &l3_nh_ops);
+	nexthop_type_ops_register(GR_NH_T_GROUP, &group_nh_ops);
 }
diff --git a/modules/ip/cli/icmp.c b/modules/ip/cli/icmp.c
index e0e52ab7b..f2ced8de6 100644
--- a/modules/ip/cli/icmp.c
+++ b/modules/ip/cli/icmp.c
@@ -33,29 +33,28 @@ static cmd_status_t icmp_send(
 	struct gr_ip4_icmp_send_req *req,
 	uint16_t msdelay,
 	uint16_t count,
+	uint16_t ident,
 	bool mode_traceroute
 ) {
 	struct gr_ip4_icmp_recv_resp *reply_resp;
 	struct gr_ip4_icmp_recv_req reply_req;
 	int timeout, ret, errors;
 	void *resp_ptr = NULL;
-	uint16_t ping_id;
 
 	stop = false;
 	errors = 0;
 	errno = 0;
-	ping_id = random();
 
 	for (int i = mode_traceroute; i < count && stop == false; i++) {
 		req->ttl = mode_traceroute ? i : 64;
-		req->ident = ping_id;
+		req->ident = ident;
 		req->seq_num = i;
 
 		ret = gr_api_client_send_recv(c, GR_IP4_ICMP_SEND, sizeof(*req), req, NULL);
 		if (ret < 0)
 			return CMD_ERROR;
 
-		reply_req.ident = ping_id;
+		reply_req.ident = ident;
 		reply_req.seq_num = i;
 		timeout = 50;
 		do {
@@ -132,6 +131,7 @@ static cmd_status_t ping(struct gr_api_client *c, const struct ec_pnode *p) {
 	struct gr_ip4_icmp_send_req req = {.seq_num = 0, .vrf = 0};
 	cmd_status_t ret = CMD_ERROR;
 	uint16_t count = UINT16_MAX;
+	uint16_t ident = random();
 	uint16_t msdelay = 1000;
 
 	if (arg_ip4(p, "IP", &req.addr) < 0)
@@ -142,12 +142,14 @@
 		return CMD_ERROR;
 	if ((ret = arg_u16(p, "DELAY", &msdelay)) < 0 && ret != ENOENT)
 		return CMD_ERROR;
+	if ((ret = arg_u16(p, "IDENT", &ident)) < 0 && ret != ENOENT)
+		return CMD_ERROR;
 
 	sighandler_t prev_handler = signal(SIGINT, sighandler);
 	if (prev_handler == SIG_ERR)
 		return CMD_ERROR;
 
-	ret = icmp_send(c, &req, msdelay, count, false);
+	ret = icmp_send(c, &req, msdelay, count, ident, false);
 
 	signal(SIGINT, prev_handler);
 
@@ -157,9 +159,12 @@
 static cmd_status_t traceroute(struct gr_api_client *c, const struct ec_pnode *p) {
 	struct gr_ip4_icmp_send_req req = {.seq_num = 0, .vrf = 0};
 	cmd_status_t ret = CMD_SUCCESS;
+	uint16_t ident = random();
 
 	if (arg_ip4(p, "IP", &req.addr) < 0)
 		return CMD_ERROR;
+	if ((ret = arg_u16(p, "IDENT", &ident)) < 0 && ret != ENOENT)
+		return CMD_ERROR;
 	if ((ret = arg_u16(p, "VRF", &req.vrf)) < 0 && ret != ENOENT)
 		return CMD_ERROR;
 
@@ -167,7 +172,7 @@ static cmd_status_t traceroute(struct gr_api_client *c, const struct ec_pnode *p
 	if (prev_handler == SIG_ERR)
 		return CMD_ERROR;
 
-	ret = icmp_send(c, &req, 0, 255, true);
+	ret = icmp_send(c, &req, 0, 255, ident, true);
 
 	signal(SIGINT, prev_handler);
 
@@ -181,23 +186,31 @@ static int ctx_init(struct ec_node *root) {
 		CLI_CONTEXT(
 			root, CTX_ARG("ping", "Send IPv4 ICMP echo requests and wait for replies.")
 		),
-		"IP [vrf VRF] [count COUNT] [delay DELAY]",
+		"IP [vrf VRF] [count COUNT] [delay DELAY] [ident IDENT]",
 		ping,
 		"Send IPv4 ICMP echo requests and wait for replies.",
 		with_help("IPv4 destination address.", ec_node_re("IP", IPV4_RE)),
 		with_help("L3 routing domain ID.", ec_node_uint("VRF", 0, UINT16_MAX - 1, 10)),
 		with_help("Number of packets to send.", ec_node_uint("COUNT", 1, UINT16_MAX, 10)),
-		with_help("Delay in ms between icmp echo.", ec_node_uint("DELAY", 0, 10000, 10))
+		with_help("Delay in ms between icmp echo.", ec_node_uint("DELAY", 0, 10000, 10)),
+		with_help(
+			"Icmp ident field (default: random).",
+			ec_node_uint("IDENT", 1, UINT16_MAX, 10)
+		)
 	);
 	if (ret < 0)
 		return ret;
 	ret = CLI_COMMAND(
 		CLI_CONTEXT(root, CTX_ARG("traceroute", "Discover IPv4 intermediate gateways.")),
-		"IP [vrf VRF]",
+		"IP [ident IDENT] [vrf VRF]",
 		traceroute,
 		"Discover IPv4 intermediate gateways.",
 		with_help("IPv4 destination address.", ec_node_re("IP", IPV4_RE)),
+		with_help(
+			"Icmp ident field (default: random).",
+			ec_node_uint("IDENT", 1, UINT16_MAX, 10)
+		),
 		with_help("L3 routing domain ID.", ec_node_uint("VRF", 0, UINT16_MAX - 1, 10))
 	);
diff --git a/modules/ip/datapath/icmp_local_send.c b/modules/ip/datapath/icmp_local_send.c
index 33c5fc20a..596ce4619 100644
--- a/modules/ip/datapath/icmp_local_send.c
+++ b/modules/ip/datapath/icmp_local_send.c
@@ -46,6 +46,13 @@ int icmp_local_send(
 	struct ctl_to_stack *msg;
 	int ret;
 
+	if (gw->type == GR_NH_T_GROUP) {
+		struct nexthop_info_group *g = (struct nexthop_info_group *)gw->info;
+		if (g->n_members == 0)
+			return errno_set(EHOSTUNREACH);
+		gw = g->members[ident % g->n_members].nh;
+	}
+
 	if ((msg = calloc(1, sizeof(struct ctl_to_stack))) == NULL)
 		return errno_set(ENOMEM);
 
@@ -105,6 +112,11 @@ static uint16_t icmp_local_send_process(
 	icmp->icmp_seq_nb = rte_cpu_to_be_16(msg->seq_num);
 	icmp->icmp_ident = rte_cpu_to_be_16(msg->ident);
 
+	// Fake RSS to spread the traffic
+	// for ECMP routes or active/active bonds.
+	mbuf->hash.rss = msg->ident;
+	mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
+
 	data = ip_local_mbuf_data(mbuf);
 	data->proto = IPPROTO_ICMP;
 	data->len = sizeof(*icmp) + sizeof(clock_t);
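Since locally generated echo requests have no real RSS hash, the ident value doubles as the load-balancing key: icmp_local_send() picks members[ident % n_members] and icmp_local_send_process() stores ident in mbuf->hash.rss. Two pings with consecutive idents therefore exercise both members of a two-way group, which is what the smoke test below relies on:

    grcli ping 192.200.0.2 count 1 ident 1   # leaves via one group member
    grcli ping 192.200.0.2 count 1 ident 2   # leaves via the other member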
diff --git a/modules/ip/datapath/ip_loadbalance.c b/modules/ip/datapath/ip_loadbalance.c
new file mode 100644
index 000000000..f5b2037c0
--- /dev/null
+++ b/modules/ip/datapath/ip_loadbalance.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2025 Christophe Fontaine
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+enum edges {
+	OUTPUT = 0,
+	NO_NEXTHOP,
+	EDGE_COUNT,
+};
+
+static uint16_t ip_loadbalance_process(
+	struct rte_graph *graph,
+	struct rte_node *node,
+	void **objs,
+	uint16_t nb_objs
+) {
+	struct ip_output_mbuf_data *d;
+	struct nexthop_info_group *g;
+	struct rte_mbuf *mbuf;
+	rte_edge_t edge;
+	uint16_t i;
+
+	for (i = 0; i < nb_objs; i++) {
+		mbuf = objs[i];
+		d = ip_output_mbuf_data(mbuf);
+		g = (struct nexthop_info_group *)d->nh->info;
+		edge = OUTPUT;
+		if (unlikely(g->n_members == 0)) {
+			edge = NO_NEXTHOP;
+			goto next;
+		}
+		// TODO: increment xstat on ! mbuf->ol_flags & RTE_MBUF_F_RX_RSS_HASH
+		d->nh = g->members[mbuf->hash.rss % g->n_members].nh;
+next:
+		if (gr_mbuf_is_traced(mbuf))
+			gr_mbuf_trace_add(mbuf, node, 0);
+
+		rte_node_enqueue_x1(graph, node, edge, mbuf);
+	}
+
+	return nb_objs;
+}
+
+static void loadbalance_register(void) {
+	ip_output_register_nexthop_type(GR_NH_T_GROUP, "ip_loadbalance");
+}
+
+static struct rte_node_register ip_lb_node = {
+	.name = "ip_loadbalance",
+	.process = ip_loadbalance_process,
+	.nb_edges = EDGE_COUNT,
+	.next_nodes = {
+		[OUTPUT] = "ip_output",
+		[NO_NEXTHOP] = "ip_lb_no_nexthop",
+	},
+};
+
+static struct gr_node_info info_loadbalance = {
+	.node = &ip_lb_node,
+	.register_callback = loadbalance_register,
+};
+
+GR_NODE_REGISTER(info_loadbalance);
+GR_DROP_REGISTER(ip_lb_no_nexthop);
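Member selection is a plain modulo over the packet's RSS hash, so the distribution is per flow (packets with the same hash always land on the same member) and, as written here, the member weights are stored but not yet factored into the draw. A trivial sketch of the same selection rule (lb_pick is an illustrative helper, not part of the code):

    // For a 2-member group, even hashes pick members[0], odd hashes pick members[1].
    static inline uint32_t lb_pick(uint32_t rss_hash, uint32_t n_members) {
    	return rss_hash % n_members;
    }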
diff --git a/modules/ip/datapath/meson.build b/modules/ip/datapath/meson.build
index cade60adc..4721b262e 100644
--- a/modules/ip/datapath/meson.build
+++ b/modules/ip/datapath/meson.build
@@ -16,6 +16,7 @@ src += files(
     'ip_fragment.c',
     'ip_hold.c',
     'ip_input.c',
+    'ip_loadbalance.c',
     'ip_local.c',
     'ip_output.c',
 )
diff --git a/modules/ip6/cli/icmp6.c b/modules/ip6/cli/icmp6.c
index acea6d659..1eae00f5c 100644
--- a/modules/ip6/cli/icmp6.c
+++ b/modules/ip6/cli/icmp6.c
@@ -25,22 +25,21 @@ static cmd_status_t icmp_send(
 	struct gr_ip6_icmp_send_req *req,
 	uint16_t msdelay,
 	uint16_t count,
+	uint16_t ident,
 	bool mode_traceroute
 ) {
 	struct gr_ip6_icmp_recv_resp *reply_resp;
 	struct gr_ip6_icmp_recv_req reply_req;
 	int i, timeout, ret, errors;
 	void *resp_ptr = NULL;
-	uint16_t ping_id;
 	const char *errdesc;
 
 	stop = false;
 	errors = 0;
 	errno = 0;
-	ping_id = random();
 
 	for (i = !!mode_traceroute; i < count && stop == false; i++) {
-		req->ident = ping_id;
+		req->ident = ident;
 		req->seq_num = i;
 		req->ttl = mode_traceroute ? i : 64;
 
@@ -48,7 +47,7 @@ static cmd_status_t icmp_send(
 		if (ret < 0)
 			return CMD_ERROR;
 
-		reply_req.ident = ping_id;
+		reply_req.ident = ident;
 		reply_req.seq_num = i;
 		timeout = 50;
 		do {
@@ -142,6 +141,7 @@ static cmd_status_t ping(struct gr_api_client *c, const struct ec_pnode *p) {
 	struct gr_ip6_icmp_send_req req = {.iface = GR_IFACE_ID_UNDEF, .vrf = 0};
 	cmd_status_t ret = CMD_ERROR;
 	uint16_t count = UINT16_MAX;
+	uint16_t ident = random();
 	uint16_t msdelay = 1000;
 	const char *str;
 
@@ -153,6 +153,8 @@ static cmd_status_t ping(struct gr_api_client *c, const struct ec_pnode *p) {
 		return CMD_ERROR;
 	if ((ret = arg_u16(p, "DELAY", &msdelay)) < 0 && ret != ENOENT)
 		return CMD_ERROR;
+	if ((ret = arg_u16(p, "IDENT", &ident)) < 0 && ret != ENOENT)
+		return CMD_ERROR;
 	if ((str = arg_str(p, "IFACE")) != NULL) {
 		struct gr_iface *iface = iface_from_name(c, str);
 		if (iface == NULL)
@@ -165,7 +167,7 @@ static cmd_status_t ping(struct gr_api_client *c, const struct ec_pnode *p) {
 	if (prev_handler == SIG_ERR)
 		return CMD_ERROR;
 
-	ret = icmp_send(c, &req, msdelay, count, false);
+	ret = icmp_send(c, &req, msdelay, count, ident, false);
 
 	signal(SIGINT, prev_handler);
 
@@ -175,12 +177,15 @@
 static cmd_status_t traceroute(struct gr_api_client *c, const struct ec_pnode *p) {
 	struct gr_ip6_icmp_send_req req = {.iface = GR_IFACE_ID_UNDEF, .vrf = 0};
 	cmd_status_t ret = CMD_SUCCESS;
+	uint16_t ident = random();
 	const char *str;
 
 	if (arg_ip6(p, "DEST", &req.addr) < 0)
 		return CMD_ERROR;
 	if ((ret = arg_u16(p, "VRF", &req.vrf)) < 0 && ret != ENOENT)
 		return CMD_ERROR;
+	if ((ret = arg_u16(p, "IDENT", &ident)) < 0 && ret != ENOENT)
+		return CMD_ERROR;
 	if ((str = arg_str(p, "IFACE")) != NULL) {
 		struct gr_iface *iface = iface_from_name(c, str);
 		if (iface == NULL)
@@ -193,7 +198,7 @@ static cmd_status_t traceroute(struct gr_api_client *c, const struct ec_pnode *p
 	if (prev_handler == SIG_ERR)
 		return CMD_ERROR;
 
-	ret = icmp_send(c, &req, 0, 255, true);
+	ret = icmp_send(c, &req, 0, 255, ident, true);
 
 	signal(SIGINT, prev_handler);
 
@@ -207,7 +212,7 @@ static int ctx_init(struct ec_node *root) {
 		CLI_CONTEXT(
 			root, CTX_ARG("ping", "Send ICMPv6 echo requests and wait for replies.")
 		),
-		"DEST [vrf VRF] [count COUNT] [delay DELAY] [iface IFACE]",
+		"DEST [vrf VRF] [count COUNT] [delay DELAY] [iface IFACE] [ident IDENT]",
 		ping,
 		"Send ICMPv6 echo requests and wait for replies.",
 		with_help("IPv6 destination address.", ec_node_re("DEST", IPV6_RE)),
@@ -217,14 +222,18 @@ static int ctx_init(struct ec_node *root) {
 		),
 		with_help("L3 routing domain ID.", ec_node_uint("VRF", 0, UINT16_MAX - 1, 10)),
 		with_help("Number of packets to send.", ec_node_uint("COUNT", 1, UINT16_MAX, 10)),
-		with_help("Delay in ms between icmp6 echo.", ec_node_uint("DELAY", 0, 10000, 10))
+		with_help("Delay in ms between icmp6 echo.", ec_node_uint("DELAY", 0, 10000, 10)),
+		with_help(
+			"Icmp ident field (default: random).",
+			ec_node_uint("IDENT", 1, UINT16_MAX, 10)
+		)
 	);
 	if (ret < 0)
 		return ret;
 	ret = CLI_COMMAND(
 		CLI_CONTEXT(root, CTX_ARG("traceroute", "Discover IPv6 intermediate gateways.")),
-		"DEST [vrf VRF] [iface IFACE]",
+		"DEST [vrf VRF] [iface IFACE] [ident IDENT]",
 		traceroute,
 		"Discover IPv6 intermediate gateways.",
 		with_help("IPv6 destination address.", ec_node_re("DEST", IPV6_RE)),
@@ -232,6 +241,10 @@ static int ctx_init(struct ec_node *root) {
 		with_help(
 			"Output interface name.",
 			ec_node_dyn("IFACE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_UNDEF))
+		),
+		with_help(
"Icmp ident field (default: random).", + ec_node_uint("IDENT", 1, UINT16_MAX, 10) ) ); diff --git a/modules/ip6/datapath/icmp6_local_send.c b/modules/ip6/datapath/icmp6_local_send.c index 05a6b23d6..8a59f813c 100644 --- a/modules/ip6/datapath/icmp6_local_send.c +++ b/modules/ip6/datapath/icmp6_local_send.c @@ -46,6 +46,13 @@ int icmp6_local_send( const struct nexthop *local; int ret; + if (gw->type == GR_NH_T_GROUP) { + struct nexthop_info_group *g = (struct nexthop_info_group *)gw->info; + if (g->n_members == 0) + return errno_set(EHOSTUNREACH); + gw = g->members[ident % g->n_members].nh; + } + if ((local = addr6_get_preferred(gw->iface_id, &nexthop_info_l3(gw)->ipv6)) == NULL) return -errno; @@ -94,6 +101,11 @@ static uint16_t icmp6_local_send_process( icmp6_echo->ident = rte_cpu_to_be_16(msg->ident); icmp6_echo->seqnum = rte_cpu_to_be_16(msg->seq_num); + // Fake RSS to spread the traffic + // for ECMP routes or active/active bonds. + mbuf->hash.rss = msg->ident; + mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH; + payload = PAYLOAD(icmp6_echo); *payload = gr_clock_us(); diff --git a/modules/ip6/datapath/ip6_loadbalance.c b/modules/ip6/datapath/ip6_loadbalance.c new file mode 100644 index 000000000..fe797a7f2 --- /dev/null +++ b/modules/ip6/datapath/ip6_loadbalance.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2025 Christophe Fontaine + +#include +#include +#include +#include +#include + +#include +#include +#include + +enum edges { + OUTPUT = 0, + NO_NEXTHOP, + EDGE_COUNT, +}; + +static uint16_t ip6_loadbalance_process( + struct rte_graph *graph, + struct rte_node *node, + void **objs, + uint16_t nb_objs +) { + struct ip6_output_mbuf_data *d; + struct nexthop_info_group *g; + struct rte_mbuf *mbuf; + rte_edge_t edge; + uint16_t i; + + for (i = 0; i < nb_objs; i++) { + mbuf = objs[i]; + d = ip6_output_mbuf_data(mbuf); + g = (struct nexthop_info_group *)d->nh->info; + edge = OUTPUT; + if (unlikely(g->n_members == 0)) { + edge = NO_NEXTHOP; + goto next; + } + // TODO: increment xstat on ! 
diff --git a/modules/ip6/datapath/ip6_loadbalance.c b/modules/ip6/datapath/ip6_loadbalance.c
new file mode 100644
index 000000000..fe797a7f2
--- /dev/null
+++ b/modules/ip6/datapath/ip6_loadbalance.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2025 Christophe Fontaine
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+enum edges {
+	OUTPUT = 0,
+	NO_NEXTHOP,
+	EDGE_COUNT,
+};
+
+static uint16_t ip6_loadbalance_process(
+	struct rte_graph *graph,
+	struct rte_node *node,
+	void **objs,
+	uint16_t nb_objs
+) {
+	struct ip6_output_mbuf_data *d;
+	struct nexthop_info_group *g;
+	struct rte_mbuf *mbuf;
+	rte_edge_t edge;
+	uint16_t i;
+
+	for (i = 0; i < nb_objs; i++) {
+		mbuf = objs[i];
+		d = ip6_output_mbuf_data(mbuf);
+		g = (struct nexthop_info_group *)d->nh->info;
+		edge = OUTPUT;
+		if (unlikely(g->n_members == 0)) {
+			edge = NO_NEXTHOP;
+			goto next;
+		}
+		// TODO: increment xstat on ! mbuf->ol_flags & RTE_MBUF_F_RX_RSS_HASH
+		d->nh = g->members[mbuf->hash.rss % g->n_members].nh;
+next:
+		if (gr_mbuf_is_traced(mbuf))
+			gr_mbuf_trace_add(mbuf, node, 0);
+
+		rte_node_enqueue_x1(graph, node, edge, mbuf);
+	}
+
+	return nb_objs;
+}
+
+static void loadbalance_register(void) {
+	ip6_output_register_nexthop_type(GR_NH_T_GROUP, "ip6_loadbalance");
+}
+
+static struct rte_node_register ip6_lb_node = {
+	.name = "ip6_loadbalance",
+	.process = ip6_loadbalance_process,
+	.nb_edges = EDGE_COUNT,
+	.next_nodes = {
+		[OUTPUT] = "ip6_output",
+		[NO_NEXTHOP] = "ip6_lb_no_nexthop",
+	},
+};
+
+static struct gr_node_info info_loadbalance = {
+	.node = &ip6_lb_node,
+	.register_callback = loadbalance_register,
+};
+
+GR_NODE_REGISTER(info_loadbalance);
+GR_DROP_REGISTER(ip6_lb_no_nexthop);
diff --git a/modules/ip6/datapath/meson.build b/modules/ip6/datapath/meson.build
index eb6feff7d..bc85f250e 100644
--- a/modules/ip6/datapath/meson.build
+++ b/modules/ip6/datapath/meson.build
@@ -10,6 +10,7 @@ src += files(
     'ip6_forward.c',
     'ip6_hold.c',
     'ip6_input.c',
+    'ip6_loadbalance.c',
     'ip6_local.c',
     'ip6_output.c',
     'ndp_na_input.c',
diff --git a/smoke/config_test.sh b/smoke/config_test.sh
index 816c97224..613712a6f 100755
--- a/smoke/config_test.sh
+++ b/smoke/config_test.sh
@@ -16,6 +16,8 @@ grcli nexthop add l3 iface p0 address ba4:f00::1 mac ba:d0:ca:ca:00:02
 grcli nexthop add l3 iface p1 address 4.3.2.1 mac ba:d0:ca:ca:00:01
 grcli nexthop add blackhole id 666
 grcli nexthop add reject id 123456
+grcli nexthop add group id 333 member 42 weight 102
+grcli nexthop add group id 333 member 45 member 47
 grcli address add 10.0.0.1/24 iface p0
 grcli address add 10.1.0.1/24 iface p1
 grcli route add 0.0.0.0/0 via 10.0.0.2
@@ -40,6 +42,7 @@ grcli stats show hardware
 grcli nexthop del 42
 grcli nexthop del 666
 grcli nexthop del 123456
+grcli nexthop del 333
 grcli interface del p0
 grcli interface del p1
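The two `nexthop add group id 333` lines are deliberate: the request sets exist_ok and GR_NH_T_GROUP's import_info swaps the whole members array, so re-adding a group under an existing id is expected to replace its member set rather than fail. In other words:

    grcli nexthop add group id 333 member 42 weight 102   # group 333 -> {42, weight 102}
    grcli nexthop add group id 333 member 45 member 47    # group 333 -> {45, 47}, weight 1 each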
diff --git a/smoke/ip_loadbalance_frr_test.sh b/smoke/ip_loadbalance_frr_test.sh
new file mode 100755
index 000000000..2657ac4d3
--- /dev/null
+++ b/smoke/ip_loadbalance_frr_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2025 Christophe Fontaine
+
+#
+#                        | p1 (.1.2)
+# [p0 ns0] --- grout --- |            ns1 lo
+#                        | p2 (.2.2)
+#
+#
+#
+. $(dirname $0)/_init_frr.sh
+
+p0=${run_id}0
+p1=${run_id}1
+p2=${run_id}2
+
+create_interface $p0 f0:0d:ac:dc:00:00
+create_interface $p1 f0:0d:ac:dc:00:01
+create_interface $p2 f0:0d:ac:dc:00:02
+
+netns_add n-$p0
+ip l set $p0 netns n-$p0
+ip -n n-$p0 link set $p0 address ba:d0:ca:ca:00:00
+ip -n n-$p0 link set $p0 up
+ip -n n-$p0 link set lo up
+ip -n n-$p0 addr add 172.16.0.2/24 dev $p0
+ip -n n-$p0 route add default via 172.16.0.1
+
+netns_add n-$p1
+ip l set $p1 netns n-$p1
+ip l set $p2 netns n-$p1
+ip -n n-$p1 link set lo up
+ip -n n-$p1 addr add 192.0.0.2/32 dev lo
+ip -n n-$p1 link set $p1 address ba:d0:ca:ca:00:01
+ip -n n-$p1 link set $p1 up
+ip -n n-$p1 addr add 172.16.1.2/24 dev $p1
+ip -n n-$p1 link set $p2 address ba:d0:ca:ca:00:02
+ip -n n-$p1 link set $p2 up
+ip -n n-$p1 addr add 172.16.2.2/24 dev $p2
+
+set_ip_address $p0 172.16.0.1/24
+set_ip_address $p1 172.16.1.1/24
+set_ip_address $p2 172.16.2.1/24
+
+set_ip_route 192.0.0.0/24 172.16.1.2
+# Can't use set_ip_route a second time
+# as the helper will look for the route
+# --> Configure it manually
+vtysh <<-EOF
+	configure terminal
+	ip route 192.0.0.0/24 172.16.2.2
+EOF
+
+ip -n n-$p1 route add default via 172.16.1.1
+ip netns exec n-$p0 ping 192.0.0.2 -i0.01 -c 3
+
+ip -n n-$p1 route del default
+ip -n n-$p1 route add default via 172.16.2.1
+ip netns exec n-$p0 ping 192.0.0.2 -i0.01 -c 3
diff --git a/smoke/ip_loadbalance_test.sh b/smoke/ip_loadbalance_test.sh
new file mode 100755
index 000000000..0f1d6e737
--- /dev/null
+++ b/smoke/ip_loadbalance_test.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2025 Christophe Fontaine
+
+. $(dirname $0)/_init.sh
+
+p0=${run_id}0
+p1=${run_id}1
+p2=${run_id}2
+
+grcli interface add port $p0 devargs net_tap0,iface=$p0 mac f0:0d:ac:dc:00:00
+grcli interface add port $p1 devargs net_tap1,iface=$p1 mac f0:0d:ac:dc:00:01
+grcli interface add port $p2 devargs net_tap2,iface=$p2 mac f0:0d:ac:dc:00:02
+
+grcli address add 172.16.0.1/24 iface $p0
+grcli address add 172.16.1.1/24 iface $p1
+grcli address add 172.16.2.1/24 iface $p2
+
+netns_add ${run_id}
+ip -n ${run_id} link set lo up
+ip -n ${run_id} addr add 192.200.0.2/24 dev lo
+
+for n in 0 1; do
+	p=$run_id$n
+	ip link set $p netns ${run_id}
+	ip -n ${run_id} link set $p address ba:d0:ca:ca:00:0$n
+	ip -n ${run_id} link set $p up
+	ip -n ${run_id} addr add 172.16.$n.2/24 dev $p
+done
+
+# Add ECMP route
+grcli nexthop add l3 iface $p0 address 172.16.0.2 id 100
+grcli nexthop add l3 iface $p1 address 172.16.1.2 id 101
+grcli nexthop add group id 10 member 100 member 101
+grcli route add 192.200.0.0/24 via id 10
+
+# Locally generated ICMP requests
+grcli ping 192.200.0.2 count 1 ident 1
+grcli ping 192.200.0.2 count 1 ident 2
+
+# Externally generated ICMP requests
+ip -n ${run_id} nexthop add id 1601 via 172.16.0.1 dev $p0
+ip -n ${run_id} nexthop add id 1611 via 172.16.1.1 dev $p1
+ip -n ${run_id} nexthop add id 1620 group 1601/1611
+
+ip -n ${run_id} route add 172.16.2.0/24 nhid 1620
+
+netns_add $p2
+ip link set $p2 netns $p2
+ip -n $p2 link set $p2 address ba:d0:ca:ca:00:02
+ip -n $p2 link set $p2 up
+ip -n $p2 addr add 172.16.2.2/24 dev $p2
+ip -n $p2 route add default via 172.16.2.1
+ip netns exec $p2 ping 192.200.0.2 -c 3
+
+grcli nexthop del 10
diff --git a/subprojects/packagefiles/frr/meson-add-dependency-definition.patch b/subprojects/packagefiles/frr/meson-add-dependency-definition.patch
index 3b3d1ada5..29310aa9b 100644
--- a/subprojects/packagefiles/frr/meson-add-dependency-definition.patch
+++ b/subprojects/packagefiles/frr/meson-add-dependency-definition.patch
@@ -103,7 +103,7 @@ index 0000000..d26e979
 +        '"' + srcdir + '/configure" ' +
 +        '--prefix="' + prefix + '" ' +
 +        '--with-moduledir="' + moduledir + '" ' +
-+        '--disable-doc --enable-multipath=1 ' +
++        '--disable-doc --enable-multipath=128 ' +
 +        '--disable-ripd --disable-ripngd --disable-ospfd --disable-ospf6d ' +
 +        '--disable-ldpd --disable-nhrpd --disable-eigrpd --disable-babeld ' +
 +        '--disable-isisd --disable-pimd --disable-pim6d --disable-pbrd --disable-fabricd ' +