apache · jacek-lewandowski · Mar 31, 2023 · Mar 31, 2023 · Apr 3, 2023 · Apr 3, 2023
diff --git a/TODO b/TODO
@@ -0,0 +1,11 @@
+* do not include commented-out lines in the commit message (like #Conflict...)
+* validate commit message - validate the expected format and whether people can be found in github
+* let the user edit the CHANGES message, make sure it is one line message
+
+* prepare commit message
+- list of commit messages
+- * use GPT to summarize the changes
+- list all users who interacted on any of the PRs or reviewers from Jira
+- get the current user
+- prepare the message
+- open editor with tmp file containing the message
diff --git a/dev/scripts/__init__.py b/dev/scripts/__init__.py
@@ -0,0 +1,5 @@
+import os
+import sys
+
+PROJECT_PATH = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(PROJECT_PATH)
diff --git a/dev/scripts/digest-circleci-workflow.py b/dev/scripts/digest-circleci-workflow.py
@@ -0,0 +1,129 @@
+# https://app.circleci.com/pipelines/github/jacek-lewandowski/cassandra/1252/workflows/b10132a7-1b4f-44d0-8808-f19a3b5fde69/jobs/63797
+# https://circleci.com/api/v2/project/gh/jacek-lewandowski/cassandra/63797/tests
+# {
+#   "items": [
+#     {
+#       "classname": "org.apache.cassandra.distributed.test.LegacyCASTest",
+#       "name": "testRepairIncompletePropose-_jdk17",
+#       "result": "success",
+#       "message": "",
+#       "run_time": 15.254,
+#       "source": "unknown"
+#     }
+#    ,{
+#       "classname": "org.apache.cassandra.distributed.test.NativeTransportEncryptionOptionsTest",
+#       "name": "testEndpointVerificationEnabledIpNotInSAN-cassandra.testtag_IS_UNDEFINED",
+#       "result": "failure",
+#       "message": "junit.framework.AssertionFailedError: Forked Java VM exited abnormally. Please note the time in the report does not reflect the time until the VM exit.\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.util.Vector.forEach(Vector.java:1365)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.util.Vector.forEach(Vector.java:1365)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.util.Vector.forEach(Vector.java:1365)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat org.apache.cassandra.anttasks.TestHelper.execute(TestHelper.java:53)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.util.Vector.forEach(Vector.java:1365)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat jdk.internal.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)",
+#       "run_time": 0.001,
+#       "source": "unknown"
+#     }
+#   ]
+# }
+import csv
+
+# So here is the plan:
+# I have a link to the pipeline: https://app.circleci.com/pipelines/github/jacek-lewandowski/cassandra/1252
+# The program goes through all the workflow jobs and list the failed tests along with the workflow, job, etc.
+# Then:
+# - separate failures into 3 groups:
+# 1. flaky - if a test was repeated in mulitple jobs and failred in some of them
+# 2. failure - if a test was repeated in multiple jobs and failed in all of them
+# 3. suspected - if a test was not repeated
+
+# Then for each failure list Jira tickets that mention the test name.
+
+# Having that information, let the user decide what to do with each failure:
+# - select a jira ticket
+# - create a new ticket
+# - do not associate with any ticket
+# - report on the PR
+
+# Eventually, the user can create the script which can perform the planned operations
+
+from lib.circleci_utils import *
+
+class TestFailure(NamedTuple):
+    file: str
+    classname: str
+    name: str
+    jobs_comp: str
+    jobs_list: list
+
+class TestFailureComparison(NamedTuple):
+    file: str
+    classname: str
+    name: str
+    feature_jobs: set
+    base_jobs: set
+    jobs_comp: str
+
+if len(sys.argv) != 4 and len(sys.argv) != 6:
+    print("Usage: %s <repo> <workflow_id> <output.csv>" % sys.argv[0])
+    print("Usage: %s <feature repo> <feature workflow id > <base repo> <base workflow id> <output.csv>" % sys.argv[0])
+    sys.exit(1)
+
+if len(sys.argv) == 4:
+    repo = sys.argv[1]
+    workflow_id = sys.argv[2]
+    output_file = sys.argv[3]
+    failed_tests_dict = get_failed_tests(repo, workflow_id)
+    failed_tests = []
+    for file in failed_tests_dict:
+        for classname in failed_tests_dict[file]:
+            for name in failed_tests_dict[file][classname]:
+                jobs = list(failed_tests_dict[file][classname][name])
+                jobs.sort()
+                failed_tests.append(TestFailure(file, classname, name, ",".join(failed_tests_dict[file][classname][name]), jobs))
+
+    # sort failed tests by jobs, file, classname, name
+    failed_tests.sort(key=lambda test: (test.jobs_comp, test.file, test.classname, test.name))
+
+    # save failed_tests to csv file
+    with open(output_file, 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['file', 'classname', 'name', 'jobs'])
+        for test in failed_tests:
+            writer.writerow([test.file, test.classname, test.name, test.jobs_comp])
+
+else:
+    feature_repo = sys.argv[1]
+    feature_workflow_id = sys.argv[2]
+    base_repo = sys.argv[3]
+    base_workflow_id = sys.argv[4]
+    output_file = sys.argv[5]
+    feature_failed_tests_dict = get_failed_tests(feature_repo, feature_workflow_id)
+    base_failed_tests_dict = get_failed_tests(base_repo, base_workflow_id)
+
+    failed_tests = []
+    all_files = set(feature_failed_tests_dict.keys()).union(set(base_failed_tests_dict.keys()))
+    for file in all_files:
+        feature_classnames = feature_failed_tests_dict[file] if file in feature_failed_tests_dict else {}
+        base_classnames = base_failed_tests_dict[file] if file in base_failed_tests_dict else {}
+        all_classnames = set(feature_classnames.keys()).union(set(base_classnames.keys()))
+        for classname in all_classnames:
+            feature_names = feature_classnames[classname] if classname in feature_classnames else {}
+            base_names = base_classnames[classname] if classname in base_classnames else {}
+            all_names = set(feature_names.keys()).union(set(base_names.keys()))
+            for name in all_names:
+                feature_jobs = feature_names[name] if name in feature_names else set()
+                base_jobs = base_names[name] if name in base_names else set()
+                jobs_comp = list(feature_jobs.union(base_jobs))
+                jobs_comp.sort()
+                failed_tests.append(TestFailureComparison(file, classname, name, feature_jobs, base_jobs, ",".join(jobs_comp)))
+
+    # sort failed tests by jobs, file, classname, name
+    failed_tests.sort(key=lambda test: (test.jobs_comp, test.file, test.classname, test.name))
+
+    # save failed_tests to csv file
+    with open(output_file, 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['file', 'classname', 'name', 'failed in feature only', 'failed in base only', 'failed in both'])
+        for test in failed_tests:
+            feature_only_jobs = list(test.feature_jobs.difference(test.base_jobs))
+            feature_only_jobs.sort()
+            base_only_jobs = list(test.base_jobs.difference(test.feature_jobs))
+            base_only_jobs.sort()
+            common_jobs = list(test.feature_jobs.intersection(test.base_jobs))
+            common_jobs.sort()
+            writer.writerow([test.file, test.classname, test.name, ",".join(feature_only_jobs), ",".join(base_only_jobs), ",".join(common_jobs)])
diff --git a/dev/scripts/lib/circleci_utils.py b/dev/scripts/lib/circleci_utils.py
@@ -0,0 +1,128 @@
+import json
+import sys
+from enum import Enum
+from typing import NamedTuple
+
+import urllib3
+
+class PipelineInfo(NamedTuple):
+    id: str
+    number: int
+
+def get_pipelines_from_circleci(repo, branch):
+    http = urllib3.PoolManager()
+    url = "https://circleci.com/api/v2/project/gh/%s/cassandra/pipeline?branch=%s" % (repo, branch)
+    r = http.request('GET', url)
+    if r.status == 200:
+        items = json.loads(r.data.decode('utf-8'))['items']
+        return [PipelineInfo(id=item['id'], number=item['number']) for item in items]
+    return None
+
+class WorkflowInfo(NamedTuple):
+    id: str
+    name: str
+    status: str
+
+def get_pipeline_workflows(pipeline_id):
+    http = urllib3.PoolManager()
+    url = "https://circleci.com/api/v2/pipeline/%s/workflow" % (pipeline_id)
+    r = http.request('GET', url)
+    if r.status == 200:
+        items = json.loads(r.data.decode('utf-8'))['items']
+        return [WorkflowInfo(id=item['id'], name=item['name'], status=item['status']) for item in items]
+
+class JobType(Enum):
+    BUILD = "build"
+    APPROVAL = "approval"
+
+class JobStatus(Enum):
+    SUCCESS = "success"
+    RUNNING = "running"
+    NOT_RUN = "not_run"
+    FAILED = "failed"
+    RETRIED = "retried"
+    QUEUED = "queued"
+    NOT_RUNNING = "not_running"
+    INFRASTRUCTURE_FAIL = "infrastructure_fail"
+    TIMEDOUT = "timedout"
+    ON_HOLD = "on_hold"
+    TERMINATED_UNKNOWN = "terminated-unknown"
+    BLOCKED = "blocked"
+    CANCELED = "canceled"
+    UNAUTHORIZED = "unauthorized"
+
+class JobInfo(NamedTuple):
+    id: str
+    name: str
+    status: JobStatus
+    job_number: str
+    type: JobType
+
+def job_info_from_json(json):
+    return JobInfo(id=json['id'], name=json['name'], status=JobStatus(json['status']), job_number=json['job_number'] if 'job_number' in json else None , type=JobType(json['type']))
+
+def get_workflow_jobs(workflow_id):
+    http = urllib3.PoolManager()
+    url = "https://circleci.com/api/v2/workflow/%s/job" % (workflow_id)
+    r = http.request('GET', url)
+    if r.status == 200:
+        items = json.loads(r.data.decode('utf-8'))['items']
+        print("Found %d jobs" % len(items))
+        return [job_info_from_json(item) for item in items]
+    return None
+
+def get_failed_jobs(workflow_id):
+    jobs = get_workflow_jobs(workflow_id)
+    failed_jobs = []
+    for job in jobs:
+        if job.status == JobStatus.FAILED and job.job_number is not None:
+            failed_jobs.append(job)
+        else:
+            print("Skipping job %s" % str(job))
+    return failed_jobs
+
+class TestResult(Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    SKIPPED = "skipped"
+    ERROR = "error"
+    UNKNOWN = "unknown"
+
+class TestInfo(NamedTuple):
+    message: str
+    source: str
+    run_time: float
+    file: str
+    result: TestResult
+    name: str
+    classname: str
+
+def get_job_tests(repo, job_number):
+    http = urllib3.PoolManager()
+    url = "https://circleci.com/api/v2/project/gh/%s/cassandra/%s/tests" % (repo, job_number)
+    r = http.request('GET', url)
+    if r.status == 200:
+        tests = [TestInfo(t['message'], t['source'], t['run_time'], t['file'] if 'file' in t else "", TestResult(t['result']), t['name'], t['classname']) for t in json.loads(r.data.decode('utf-8'))['items']]
+        return tests
+    return None
+
+
+def get_failed_tests(repo, workflow_id):
+    failed_jobs = get_failed_jobs(workflow_id)
+    failed_tests = {}
+    for job in failed_jobs:
+        print("Getting tests for job %s" % str(job))
+        tests = get_job_tests(repo, job.job_number)
+        for test in tests:
+            if test.result == TestResult.FAILURE:
+                if test.file not in failed_tests:
+                    failed_tests[test.file] = {}
+                if test.classname not in failed_tests[test.file]:
+                    failed_tests[test.file][test.classname] = {}
+                test_name = test.name.split("-", 2)[0]
+                test_name = test_name.split("[", 2)[0]
+                if test_name not in failed_tests[test.file][test.classname]:
+                    failed_tests[test.file][test.classname][test_name] = set()
+                failed_tests[test.file][test.classname][test_name].add(job.name)
+
+    return failed_tests