Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Google Bucket Job #2686

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 1 addition & 77 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -1275,29 +1275,6 @@
"line_number": 33
}
],
"kube/services/jobs/aws-bucket-replicate-job.yaml": [
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/aws-bucket-replicate-job.yaml",
"hashed_secret": "deb02468778f4041fb189654698ac948e436732d",
"is_verified": false,
"line_number": 33
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/aws-bucket-replicate-job.yaml",
"hashed_secret": "abe72fcb190ed9c73eb20e198c73a97605b95063",
"is_verified": false,
"line_number": 36
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/aws-bucket-replicate-job.yaml",
"hashed_secret": "ca3cdac59f2bfa45cb014190e4509bf6becf28fb",
"is_verified": false,
"line_number": 42
}
],
"kube/services/jobs/bucket-manifest-job.yaml": [
{
"type": "Secret Keyword",
Expand Down Expand Up @@ -1703,29 +1680,6 @@
"line_number": 31
}
],
"kube/services/jobs/google-bucket-replicate-job.yaml": [
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/google-bucket-replicate-job.yaml",
"hashed_secret": "b6f0ec0b08da77656ced48427841e28d7a8a81d6",
"is_verified": false,
"line_number": 35
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/google-bucket-replicate-job.yaml",
"hashed_secret": "abe72fcb190ed9c73eb20e198c73a97605b95063",
"is_verified": false,
"line_number": 38
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/google-bucket-replicate-job.yaml",
"hashed_secret": "ca3cdac59f2bfa45cb014190e4509bf6becf28fb",
"is_verified": false,
"line_number": 41
}
],
"kube/services/jobs/google-create-bucket-job.yaml": [
{
"type": "Secret Keyword",
Expand Down Expand Up @@ -2373,36 +2327,6 @@
"line_number": 43
}
],
"kube/services/jobs/replicate-validation-job.yaml": [
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/replicate-validation-job.yaml",
"hashed_secret": "deb02468778f4041fb189654698ac948e436732d",
"is_verified": false,
"line_number": 34
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/replicate-validation-job.yaml",
"hashed_secret": "b6f0ec0b08da77656ced48427841e28d7a8a81d6",
"is_verified": false,
"line_number": 37
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/replicate-validation-job.yaml",
"hashed_secret": "abe72fcb190ed9c73eb20e198c73a97605b95063",
"is_verified": false,
"line_number": 40
},
{
"type": "Secret Keyword",
"filename": "kube/services/jobs/replicate-validation-job.yaml",
"hashed_secret": "ca3cdac59f2bfa45cb014190e4509bf6becf28fb",
"is_verified": false,
"line_number": 43
}
],
"kube/services/jobs/s3sync-cronjob.yaml": [
{
"type": "Secret Keyword",
Expand Down Expand Up @@ -3253,5 +3177,5 @@
}
]
},
"generated_at": "2024-11-25T11:39:59Z"
"generated_at": "2024-12-13T06:58:14Z"
}
2 changes: 1 addition & 1 deletion Docker/awshelper/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,4 @@ RUN git config --global user.email gen3 \
RUN export GEN3_HOME="$HOME/cloud-automation" \
&& bash -c 'source "$GEN3_HOME/gen3/gen3setup.sh" && gen3 help'

CMD /bin/bash
CMD ["/bin/bash"]
4 changes: 2 additions & 2 deletions kube/services/jobs/aws-bucket-replicate-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ spec:
args:
- "-c"
- |
cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
cat /secrets/dcf_dataservice_settings > ./dcfdataservice/settings.py
echo """
[default]
region: us-east-1
""" > ~/.aws/config
aws configure set default.s3.max_concurrent_requests 1000
aws configure set default.s3.max_queue_size 10000
python replicate.py aws_replicate --release $RELEASE --quick_test $QUICK_TEST --bucket $GDC_BUCKET_NAME --thread_num $THREAD_NUM --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": $CHUNK_SIZE, \"log_bucket\": \"$LOG_BUCKET\"}"
# python scripts/replicate.py indexing --thread_num 20 --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": 3, \"log_bucket\": \"$LOG_BUCKET\"}"
# python dcfdataservice/replicate.py indexing --thread_num 20 --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": 3, \"log_bucket\": \"$LOG_BUCKET\"}"
restartPolicy: Never
10 changes: 8 additions & 2 deletions kube/services/jobs/google-bucket-replicate-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ spec:
- name: creds-json-volume
secret:
secretName: "dcf-dataservice-json-secret"
- name: project-map-volume
configMap:
name: project-map-manifest
containers:
- name: datareplicate
GEN3_DATAREPLICATE_IMAGE
Expand Down Expand Up @@ -78,11 +81,14 @@ spec:
- name: "creds-json-volume"
mountPath: "/secrets/dcf_dataservice_credentials.json"
subPath: "dcf_dataservice_credentials.json"
- name: "project-map-volume"
mountPath: "/dcf-dataservice/GDC_project_map.json"
subPath: "GDC_project_map.json"
command: ["/bin/bash" ]
args:
- "-c"
- |
cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
cat /secrets/dcf_dataservice_settings > ./dcfdataservice/settings.py
gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
export http_proxy='http://cloud-proxy.internal.io:3128'
Expand All @@ -97,7 +103,7 @@ spec:
fi
if [[ "$type" == "active" || "$type" == "legacy" ]]; then
rand_str="$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 5 | head -n 1)"
python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\", \"log_bucket\": \"$LOG_BUCKET\"}" --requirements_file requirements.txt --extra_package indexclient-1.6.0.zip --requirements_file scripts/requirements.txt
python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\", \"log_bucket\": \"$LOG_BUCKET\"}" --requirements_file requirements.txt --extra_package indexclient-1.6.0.zip --requirements_file dcfdataservice/requirements.txt
else
echo "Neither active nor legacy manifest is provided. Please check the manifest name!!!"
fi
Expand Down
13 changes: 10 additions & 3 deletions kube/services/jobs/replicate-validation-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ spec:
- name: creds-json-volume
secret:
secretName: "dcf-dataservice-json-secret"
- name: project-map-volume
configMap:
name: project-map-manifest
containers:
- name: datareplicate
GEN3_DATAREPLICATE_IMAGE
Expand All @@ -64,6 +67,8 @@ spec:
GEN3_FORCE_CREATE_MANIFEST
- name: LOG_BUCKET
GEN3_LOG_BUCKET
- name: MAP_FILE
value: "" # Default to empty string if not provided
volumeMounts:
- name: aws-cred-volume
mountPath: "/root/.aws/credentials"
Expand All @@ -76,7 +81,9 @@ spec:
subPath: "dcf_dataservice_settings"
- name: "creds-json-volume"
mountPath: "/secrets/dcf_dataservice_credentials.json"
subPath: "dcf_dataservice_credentials.json"
- name: "project-map-volume"
mountPath: "/dcf-dataservice/GDC_project_map.json"
subPath: "GDC_project_map.json"
command: ["/bin/bash" ]
args:
- "-c"
Expand All @@ -85,9 +92,9 @@ spec:
[default]
region: us-east-1
""" > ~/.aws/config
cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
cat /secrets/dcf_dataservice_settings > ./dcfdataservice/settings.py
gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
gsutil cp $IGNORED_FILE /dcf-dataservice/ignored_files_manifest.csv
python replicate.py validate --global_config "{\"release\": \"$RELEASE\", \"manifest_files\":\"$MANIFEST_FILES\", \"out_manifests\": \"$OUT_FILES\", \"FORCE_CREATE_MANIFEST\": \"$FORCE_CREATE_MANIFEST\", \"log_bucket\": \"$LOG_BUCKET\", \"save_copied_objects\": 1}"
python replicate.py validate --global_config "{\"release\": \"$RELEASE\", \"manifest_files\":\"$MANIFEST_FILES\", \"out_manifests\": \"$OUT_FILES\", \"FORCE_CREATE_MANIFEST\": \"$FORCE_CREATE_MANIFEST\", \"log_bucket\": \"$LOG_BUCKET\", \"map_file\": \"$MAP_FILE\", \"save_copied_objects\": 1}"
restartPolicy: Never
Loading