Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prov/tcp: implement FI_FIREWALL_ADDR for AV inserts #10724

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/ofi_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -923,14 +923,16 @@ struct util_av_attr {
*/
struct util_peer_addr {
struct rxm_av *av;
uint64_t av_flags;
fi_addr_t fi_addr;
struct ofi_rbnode *node;
int index;
int refcnt;
union ofi_sock_ip addr;
};

struct util_peer_addr *util_get_peer(struct rxm_av *av, const void *addr);
int util_get_peer(struct rxm_av *av, const void *addr, struct util_peer_addr **peer,
uint64_t flags);
void util_put_peer(struct util_peer_addr *peer);

/* All peer addresses, whether they've been inserted into the AV
Expand Down
1 change: 1 addition & 0 deletions include/rdma/fabric.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ typedef struct fid *fid_t;
#define FI_PEER_TRANSFER (1ULL << 36)
/* #define FI_MR_DMABUF (1ULL << 40) */
#define FI_AV_USER_ID (1ULL << 41)
#define FI_FIREWALL_ADDR (1ULL << 42)
#define FI_PEER (1ULL << 43)
/* #define FI_XPU_TRIGGER (1ULL << 44) */

Expand Down
10 changes: 10 additions & 0 deletions man/fi_av.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,16 @@ fi_av_set_user_id.

See the user ID section below.

- *FI_FIREWALL_ADDR*
: This flag indicates that the address is behind a firewall and outgoing
connections are not allowed. If there is not an existing connection and the
provider is unable to circumvent the firewall, an FI_EHOSTUNREACH error
should be expected. If multiple addresses are being inserted simultaneously,
the flag applies to all of them. Additionally, it is possible that a
connection is available at insertion time, but is later torn down. Future
reconnects triggered by operations on the ep (fi_send, for example) may also
fail with the same error.

## fi_av_insertsvc

The fi_av_insertsvc call behaves similar to fi_av_insert, but allows the
Expand Down
9 changes: 6 additions & 3 deletions prov/rxm/src/rxm_conn.c
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,9 @@ ssize_t rxm_get_conn(struct rxm_ep *ep, fi_addr_t addr, struct rxm_conn **conn)
return 0;
}

if ((*peer)->av_flags & FI_FIREWALL_ADDR)
return -FI_EHOSTUNREACH;

ret = rxm_connect(*conn);

/* If the progress function encounters an error trying to establish
Expand Down Expand Up @@ -657,9 +660,9 @@ rxm_process_connreq(struct rxm_ep *ep, struct rxm_eq_cm_entry *cm_entry)
ofi_addr_set_port(&peer_addr.sa, cm_entry->data.connect.port);

av = container_of(ep->util_ep.av, struct rxm_av, util_av);
peer = util_get_peer(av, &peer_addr);
if (!peer) {
RXM_WARN_ERR(FI_LOG_EP_CTRL, "util_get_peer", -FI_ENOMEM);
ret = util_get_peer(av, &peer_addr, &peer, 0);
if (ret) {
RXM_WARN_ERR(FI_LOG_EP_CTRL, "util_get_peer", ret);
goto reject;
}

Expand Down
8 changes: 5 additions & 3 deletions prov/tcp/src/xnet_rdm_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ ssize_t xnet_get_conn(struct xnet_rdm *rdm, fi_addr_t addr,
return -FI_ENOMEM;

if (!(*conn)->ep) {
if ((*peer)->av_flags & FI_FIREWALL_ADDR)
return -FI_EHOSTUNREACH;
ret = xnet_rdm_connect(*conn);
if (ret)
return ret;
Expand Down Expand Up @@ -438,9 +440,9 @@ static void xnet_process_connreq(struct fi_eq_cm_entry *cm_entry)
ofi_addr_set_port(&peer_addr.sa, ntohs(msg->port));

av = container_of(rdm->util_ep.av, struct rxm_av, util_av);
peer = util_get_peer(av, &peer_addr);
if (!peer) {
XNET_WARN_ERR(FI_LOG_EP_CTRL, "util_get_peer", -FI_ENOMEM);
ret = util_get_peer(av, &peer_addr, &peer, 0);
if (ret) {
XNET_WARN_ERR(FI_LOG_EP_CTRL, "util_get_peer", ret);
goto reject;
}

Expand Down
36 changes: 21 additions & 15 deletions prov/util/src/rxm_av.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,23 +98,29 @@ static void rxm_free_peer(struct util_peer_addr *peer)
ofi_ibuf_free(peer);
}

struct util_peer_addr *
util_get_peer(struct rxm_av *av, const void *addr)

int util_get_peer(struct rxm_av *av, const void *addr, struct util_peer_addr **peer,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: check line length

uint64_t flags)
{
struct util_peer_addr *peer;
struct ofi_rbnode *node;
int ret;

ofi_mutex_lock(&av->util_av.lock);
node = ofi_rbmap_find(&av->addr_map, (void *) addr);
if (node) {
peer = node->data;
peer->refcnt++;
*peer = node->data;
(*peer)->refcnt++;
shefty marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set ret = FI_SUCCESS here to avoid an extra branch on return.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something is off in the flag settings. A peer might be inserted in response to a connection request (xnet_process_connreq). In that case, peer->av_flags = 0. When an address is inserted with FI_FIREWALL_ADDR, the peer will be found, but peer->av_flags won't be adjusted.

I think you need (*peer)->av_flags |= flags; here.

} else if (flags & FI_FIREWALL_ADDR) {
*peer = NULL;
ret = -FI_EHOSTUNREACH;
} else {
peer = rxm_alloc_peer(av, addr);
*peer = rxm_alloc_peer(av, addr);
if (!*peer)
ret = -FI_ENOMEM;
(*peer)->av_flags = flags;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs to be part of an else, or this results in a crash.

}

ofi_mutex_unlock(&av->util_av.lock);
return peer;
return peer ? FI_SUCCESS : ret;
}

static void util_deref_peer(struct util_peer_addr *peer)
Expand Down Expand Up @@ -165,17 +171,17 @@ rxm_put_peer_addr(struct rxm_av *av, fi_addr_t fi_addr)

static int
rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count,
fi_addr_t *fi_addr, fi_addr_t *user_ids)
fi_addr_t *fi_addr, fi_addr_t *user_ids, uint64_t flags)
{
struct util_peer_addr *peer;
const void *cur_addr;
fi_addr_t cur_fi_addr;
size_t i;
size_t i, ret;

for (i = 0; i < count; i++) {
cur_addr = ((char *) addr + i * av->util_av.addrlen);
peer = util_get_peer(av, cur_addr);
if (!peer)
ret = util_get_peer(av, cur_addr, &peer, flags);
if (ret)
goto err;

if (user_ids) {
Expand Down Expand Up @@ -206,7 +212,7 @@ rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count,
ofi_mutex_unlock(&av->util_av.lock);
}
}
return -FI_ENOMEM;
return ret;
}

static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
Expand Down Expand Up @@ -299,7 +305,7 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count,

count = ret;

ret = rxm_av_add_peers(av, addr, count, fi_addr, user_ids);
ret = rxm_av_add_peers(av, addr, count, fi_addr, user_ids, flags);
if (ret) {
rxm_av_remove(av_fid, fi_addr, count, flags);
goto out;
Expand Down Expand Up @@ -345,7 +351,7 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node,
if (ret > 0 && ret < count)
count = ret;

ret = rxm_av_add_peers(av, addr, count, fi_addr, NULL);
ret = rxm_av_add_peers(av, addr, count, fi_addr, NULL, flags);
if (ret) {
rxm_av_remove(av_fid, fi_addr, count, flags);
return ret;
Expand Down