From 99398aa60b16ef01033b6aa784cad837ab690b4b Mon Sep 17 00:00:00 2001 From: taloric Date: Thu, 31 Oct 2024 18:56:56 +0800 Subject: [PATCH] fix: add x_request_id for server_span match --- app/app/application/l7_flow_tracing.py | 68 ++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/app/app/application/l7_flow_tracing.py b/app/app/application/l7_flow_tracing.py index e02f6fe..dea548d 100644 --- a/app/app/application/l7_flow_tracing.py +++ b/app/app/application/l7_flow_tracing.py @@ -1547,8 +1547,11 @@ def __init__(self, group_key: str): self.app_span_roots: List[SpanNode] = None # 用于存放 `app_span` 的所有 leaf self.app_span_leafs: List[SpanNode] = None + # 记录叶子节点的 syscall_trace_id, 用以匹配 s-p root self.leaf_syscall_trace_id_request: Set[int] = set() self.leaf_syscall_trace_id_response: Set[int] = set() + # 记录叶子节点的 x_request_id => index (in self.spans), 用以匹配 s-p root + self.leaf_x_request_id: Dict[str, List[int]] = {} # 用于显示调用拓扑使用 self.subnet_id = None self.subnet = None @@ -1659,10 +1662,23 @@ def append_sys_span(self, sys_span: SysSpanNode): self._set_extra_value_for_sys_span(sys_span) self._set_auto_service(sys_span) if sys_span.tap_side == TAP_SIDE_CLIENT_PROCESS: - self.leaf_syscall_trace_id_request.add( - sys_span.get_syscall_trace_id_request()) - self.leaf_syscall_trace_id_response.add( - sys_span.get_syscall_trace_id_response()) + cp_syscall_trace_id_req = sys_span.get_syscall_trace_id_request() + cp_syscall_trace_id_res = sys_span.get_syscall_trace_id_response() + cp_x_request_id_0 = sys_span.get_x_request_id_0() + cp_x_request_id_1 = sys_span.get_x_request_id_1() + if cp_syscall_trace_id_req: + self.leaf_syscall_trace_id_request.add( + sys_span.get_syscall_trace_id_request()) + if cp_syscall_trace_id_res: + self.leaf_syscall_trace_id_response.add( + sys_span.get_syscall_trace_id_response()) + if cp_x_request_id_0: + # index of sys_span = len(self.spans)-1 + self.leaf_x_request_id.setdefault( + cp_x_request_id_0, []).append(len(self.spans) - 1) + if cp_x_request_id_1 and cp_x_request_id_1 != cp_x_request_id_0: + self.leaf_x_request_id.setdefault( + cp_x_request_id_1, []).append(len(self.spans) - 1) def remove_server_sys_span(self, sys_span: SysSpanNode): # 这里应该要做 append_sys_span 的逆操作(但对象仅为 ServerProcess sys_span) @@ -1743,7 +1759,7 @@ def split_to_multiple_process_span_set(self) -> list: # 极端情况下可能会有多个没有 parent_span_id 的入口,这里没法分辨它们的关系,不做拆分 if root_parent_span_id == '': root_parent_span_id = "root" # 只是标记 root_parent_span_id,没有实际作用 - if root_parent_span_id not in split_result: + if split_result.get(root_parent_span_id, None) is None: newSet = ProcessSpanSet(root_parent_span_id) newSet.app_span_roots = [self.spans[root_span_index]] newSet._copy_meta_data_from(self) @@ -1776,7 +1792,12 @@ def attach_sys_span_via_app_span(self, sys_span: SysSpanNode) -> bool: return self._attach_client_sys_span(sys_span) def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool: + # connection priority: span_id > syscall_trace_id > x_request_id span_id_of_sys_span = sys_span.get_span_id() + syscall_trace_id_request = sys_span.get_syscall_trace_id_request() + syscall_trace_id_response = sys_span.get_syscall_trace_id_response() + x_request_id_0 = sys_span.get_x_request_id_0() + x_request_id_1 = sys_span.get_x_request_id_1() if span_id_of_sys_span: for app_root in self.app_span_roots: if span_id_of_sys_span == app_root.get_parent_span_id(): @@ -1815,10 +1836,9 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool: "s-p sys_span mounted due to same span_id as parent", self.mounted_callback) return True - else: - syscall_trace_id_request = sys_span.get_syscall_trace_id_request() - syscall_trace_id_response = sys_span.get_syscall_trace_id_response( - ) + + # span_id not matched, try syscall_trace_id + if syscall_trace_id_request or syscall_trace_id_response: for app_root in self.app_span_roots: # 如果 span_id 不存在,说明可能是入口 span,上游没有注入 span_id,此时根据叶子节点 c-p 的 syscall_trace_id 匹配即可 # 这里匹配可以严格点,s-p 和 c-p 只会同侧(req-req / res-res)相等,避免误关联一个独立的 c-p @@ -1830,6 +1850,36 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool: "s-p sys_span mounted due to syscall_trace_id matched c-p", self.mounted_callback) return True + + # span_id/syscall not matched, try x_request_id + if x_request_id_0 or x_request_id_1: + # 场景:过 ingress/nginx 进入服务网关/服务,传递了 x_request_id,且作为首个 span 没有 trace_id/span_id + # 且发生跨线程调度,无法基于 syscall 关联时,允许通过 s-p.x_request_id(0/1) <=> c-p.x_request_id(0/1) 关联 + # 此处已确保 auto_instance_id 一致 (即同一个进程) + + # x_req_id 同侧相等: 透传 x_req_id,来自上游 + # x_req_id 异侧相等: 注入 x_req_id,内部产生 + x_req_id_matched = False + # 同一个进程内时间一定覆盖 + for same_xreqid_idx in self.leaf_x_request_id.get( + x_request_id_0, []): + if sys_span.time_range_cover(self.spans[same_xreqid_idx]): + x_req_id_matched = True + if not x_req_id_matched: + for same_xreqid_idx in self.leaf_x_request_id.get( + x_request_id_1, []): + if sys_span.time_range_cover(self.spans[same_xreqid_idx]): + x_req_id_matched = True + if x_req_id_matched: + for app_root in self.app_span_roots: + if app_root.get_parent_id() < 0: + self.append_sys_span(sys_span) + app_root.set_parent( + sys_span, + "s-p sys_span mounted due to x_request_id matched c-p", + self.mounted_callback) + return True + return False def _attach_client_sys_span(self, sys_span: SysSpanNode) -> bool: