New branch back please check. #1140 (Open)
Wants to merge 47 commits into base: master
Changes from all commits

Commits (47)
9aafa71
added sup_cumulative branch changes
KunalTiwary Sep 23, 2024
715762d
minor changes in count
KunalTiwary Sep 23, 2024
d510c26
added schedule_update_SpeechConversation
KunalTiwary Sep 30, 2024
4256239
created a proxy endpoint for xlit api
kartikvirendrar Oct 9, 2024
394b169
Merge pull request #1123 from AI4Bharat/xlit-proxy-api
ishvindersethi22 Oct 10, 2024
30a62cb
added fix for ac_enabled_stage
KunalTiwary Oct 28, 2024
085b3ea
Merge pull request #1125 from AI4Bharat/minor_fix_ac_en_stage_bbm
aparna-aa Oct 28, 2024
2c36d9e
added new project type OCRTextlineSegmentation
kartikvirendrar Nov 28, 2024
7c65424
Update project_registry.yaml
kartikvirendrar Nov 28, 2024
286b385
added minor changes
KunalTiwary Dec 4, 2024
c43552c
Update views.py
ishvindersethi22 Dec 5, 2024
c56fb8e
Update views.py
tahirjmakhdoomi Dec 5, 2024
8f7a94f
Update annotation_registry.py
tahirjmakhdoomi Dec 5, 2024
9de45a0
Update annotation_registry.py
tahirjmakhdoomi Dec 5, 2024
600b0e8
Update annotation_registry.py
tahirjmakhdoomi Dec 5, 2024
491659d
Update annotation_registry.py
tahirjmakhdoomi Dec 5, 2024
af10b5a
Update views.py
tahirjmakhdoomi Dec 5, 2024
5c6b91d
Update views.py
tahirjmakhdoomi Dec 7, 2024
068a73c
Update views.py
tahirjmakhdoomi Dec 7, 2024
2182c7a
Update views.py
tahirjmakhdoomi Dec 7, 2024
8855826
added fix for draft_data_json
KunalTiwary Dec 8, 2024
31f79a8
added changes in download
KunalTiwary Dec 9, 2024
a6a8b04
Merge pull request #1128 from AI4Bharat/back-brnch-master-ante-changes
ishvindersethi22 Dec 10, 2024
8d9fd12
added minor changes for ocr_te
KunalTiwary Dec 12, 2024
dc78e76
small bug fix
KunalTiwary Dec 13, 2024
1bf9ab0
Merge branch 'back-branch-master' into xlit-proxy-api
ishvindersethi22 Dec 13, 2024
a9d758c
Merge pull request #1127 from AI4Bharat/xlit-proxy-api
ishvindersethi22 Dec 13, 2024
7cb99c9
Added Task Analytics Cron Setup
Shanks0465 Dec 21, 2024
3b5e9df
Added Task Analytics Caching
Shanks0465 Dec 21, 2024
0aed468
Added On Start Trigger for Task Count
Shanks0465 Dec 21, 2024
2c096f2
Added Workspace Task Analytics Cron
Shanks0465 Dec 27, 2024
fddfeb5
Added freeze_task to SpeechConversation and updated assign_new_tasks
Shanks0465 Dec 31, 2024
eae43c2
Added freeze task filter to assign review and supercheck tasks
Shanks0465 Dec 31, 2024
abdc6e6
Added OCRSegmentCategorizationEditing Task Count
Shanks0465 Jan 1, 2025
fed6705
Merge pull request #1132 from AI4Bharat/reports-caching-task-count
ishvindersethi22 Jan 2, 2025
ebf59f1
Updated Task Analytics Cron to 1 hour
Shanks0465 Jan 2, 2025
c5ba18c
Merge branch 'back-branch-master' into reports-caching-task-count
ishvindersethi22 Jan 2, 2025
e08409d
Merge pull request #1134 from AI4Bharat/reports-caching-task-count
ishvindersethi22 Jan 2, 2025
d305844
Updated Task Count Cron with minute set to 0
Shanks0465 Jan 3, 2025
e8bf2e9
Merge pull request #1136 from AI4Bharat/reports-caching-task-count
ishvindersethi22 Jan 3, 2025
e219314
Delete backend/projects/migrations/0053_alter_project_project_type.py
ishvindersethi22 Jan 3, 2025
7bafe1e
Delete backend/users/migrations/0034_alter_user_is_approved.py
ishvindersethi22 Jan 3, 2025
7f71004
Merge branch 'back-branch-master' into speech-task-freeze
ishvindersethi22 Jan 3, 2025
dd36877
Merge pull request #1135 from AI4Bharat/speech-task-freeze
ishvindersethi22 Jan 3, 2025
ffe5658
Update views.py
ishvindersethi22 Jan 4, 2025
2899092
Your commit message explaining the changes
munishmangla98 Jan 7, 2025
cca6c6f
all changes done here
munishmangla98 Jan 15, 2025
The diff you're trying to view is too large. We only load the first 3000 changed files.
5 changes: 4 additions & 1 deletion backend/dataset/admin.py
@@ -1,4 +1,7 @@
import resource
# import resource
import os
# if os.name != 'nt': # 'nt' means Windows
# import resource # Only import resource on non-Windows platforms
from django.contrib import admin
from import_export.admin import ImportExportActionModelAdmin
from .resources import *
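For context, the commented-out lines above point at a platform guard: the standard-library `resource` module exists only on Unix-like systems, so an unconditional import fails on Windows. A minimal sketch of that guard, assuming admin.py still needs `resource` on POSIX hosts, could look like this:

import os

# 'resource' is POSIX-only; guard the import so the admin module still
# loads on Windows development machines.
if os.name != "nt":
    import resource  # noqa: F401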
18 changes: 18 additions & 0 deletions backend/dataset/migrations/0047_speechconversation_freeze_task.py
@@ -0,0 +1,18 @@
# Generated by Django 3.2.14 on 2024-12-31 01:54

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('dataset', '0046_merge_20240416_2233'),
]

operations = [
migrations.AddField(
model_name='speechconversation',
name='freeze_task',
field=models.BooleanField(default=False, help_text='Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.', verbose_name='freeze_task'),
),
]
7 changes: 7 additions & 0 deletions backend/dataset/models.py
@@ -1,6 +1,7 @@
"""
Model definitions for Dataset Management
"""

from django.db import models
from users.models import User, LANG_CHOICES
from organizations.models import Organization
@@ -485,6 +486,12 @@ class SpeechConversation(DatasetBase):
help_text=("Prepopulated prediction for the implemented models"),
)

freeze_task = models.BooleanField(
verbose_name="freeze_task",
default=False,
help_text="Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.",
)

def __str__(self):
return str(self.id)

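The new `freeze_task` flag lets an administrator pull a SpeechConversation data item out of circulation so it is not handed out for annotation. A minimal sketch of how task assignment could honour it, where every name other than `freeze_task` is an assumption about the surrounding code rather than something shown in this diff:

from dataset.models import SpeechConversation

def unfrozen_speech_items(instance_id):
    # Exclude items an administrator has frozen; freeze_task defaults to
    # False, so existing rows remain assignable after the migration runs.
    return SpeechConversation.objects.filter(
        instance_id=instance_id,
        freeze_task=False,
    )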
61 changes: 61 additions & 0 deletions backend/dataset/views.py
@@ -296,6 +296,67 @@ def list(self, request, *args, **kwargs):
dataset_instance["last_upload_result"] = dataset_instance_result

return Response(serializer.data)



# # new optimized code.

def list(self, request, *args, **kwargs):
# Org Owners and superusers see all datasets
if request.user.is_superuser:
queryset = DatasetInstance.objects.all()
elif request.user.role == User.ORGANIZATION_OWNER:
queryset = DatasetInstance.objects.filter(
organisation_id=request.user.organization
)
# Managers only see datasets that they are added to and public datasets
else:
queryset = DatasetInstance.objects.filter(
organisation_id=request.user.organization
).filter(Q(public_to_managers=True) | Q(users__id=request.user.id))

if "dataset_visibility" in request.query_params:
dataset_visibility = request.query_params["dataset_visibility"]
if dataset_visibility == "all_public_datasets":
if (request.user.role == User.WORKSPACE_MANAGER) and (
request.user.is_superuser == False
):
queryset = queryset.filter(public_to_managers=True)
elif dataset_visibility == "my_datasets":
queryset = queryset.filter(users__id=request.user.id)

# Filter the queryset based on the query params
if "dataset_type" in dict(request.query_params):
queryset = queryset.filter(
dataset_type__exact=request.query_params["dataset_type"]
)

# Serialize the distinct items and sort by instance ID
serializer = DatasetInstanceSerializer(
queryset.distinct().order_by("instance_id"), many=True
)

# Add status fields to the serializer data
for dataset_instance in serializer.data:
# Get the task statuses for the dataset instance
(
dataset_instance_status,
dataset_instance_date,
dataset_instance_time,
dataset_instance_result,
) = get_dataset_upload_status(dataset_instance["instance_id"])

# Add the task status and time to the dataset instance response
dataset_instance["last_upload_status"] = dataset_instance_status
dataset_instance["last_upload_date"] = dataset_instance_date
dataset_instance["last_upload_time"] = dataset_instance_time
dataset_instance["last_upload_result"] = dataset_instance_result

return Response(serializer.data)





@is_organization_owner
@action(methods=["GET"], detail=True, name="Download Dataset in CSV format")
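The rewritten list view scopes DatasetInstance rows by role (superuser, organization owner, or workspace manager) and then narrows the queryset by the optional `dataset_visibility` and `dataset_type` query parameters before attaching the last-upload status fields. A hedged client-side sketch, where the host, path and token scheme are assumptions rather than part of this diff:

import requests

resp = requests.get(
    "https://backend.example.com/data/instances/",   # assumed route
    params={
        "dataset_visibility": "my_datasets",          # or "all_public_datasets"
        "dataset_type": "SpeechConversation",         # optional type filter
    },
    headers={"Authorization": "Token <api-token>"},   # assumed auth scheme
    timeout=30,
)
for instance in resp.json():
    print(instance["instance_id"], instance["last_upload_status"])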
178 changes: 170 additions & 8 deletions backend/functions/tasks.py
@@ -1,7 +1,10 @@
import datetime
import json
import time
import zipfile
import threading

import requests
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
import pandas as pd
from celery import shared_task
@@ -29,7 +32,7 @@
ANNOTATED,
)
from tasks.views import SentenceOperationViewSet
from users.models import User, LANG_CHOICES
from users.models import User
from django.core.mail import EmailMessage

from utils.blob_functions import (
@@ -47,7 +50,7 @@
get_batch_asr_predictions,
)
from django.db import transaction, DataError, IntegrityError
from dataset.models import DatasetInstance
from dataset.models import DatasetInstance, SpeechConversation
from django.apps import apps
from rest_framework.test import APIRequestFactory
from django.http import QueryDict
@@ -56,8 +59,13 @@
import tempfile

from shoonya_backend.locks import Lock

from utils.constants import LANG_CHOICES
from projects.tasks import filter_data_items
from projects.models import BATCH
from dataset import models as dataset_models
from projects.registry_helper import ProjectRegistry
import logging
from tqdm import tqdm

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -72,6 +80,10 @@ def sentence_text_translate_and_save_translation_pairs(
input_dataset_instance_id,
output_dataset_instance_id,
batch_size,
filter_string,
sampling_mode,
sampling_parameters,
variable_parameters,
api_type="indic-trans-v2",
checks_for_particular_languages=False,
automate_missing_data_items=True,
@@ -87,6 +99,10 @@
Allowed - [indic-trans, google, indic-trans-v2, azure, blank]
checks_for_particular_languages (bool): If True, checks for the particular languages in the translations.
automate_missing_data_items (bool): If True, consider only those data items that are missing in the target dataset instance.
filter_string (str): string to filter input data.
sampling_mode (str): can be batch or full.
sampling_parameters (json): is a json that contains, batch number and batch size

"""
task_name = "sentence_text_translate_and_save_translation_pairs"
output_sentences = list(
@@ -113,6 +129,14 @@
"metadata_json",
)
)
if filter_string and sampling_mode and sampling_parameters:
input_sentences = get_filtered_items(
"SentenceText",
input_dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)

# Convert the input_sentences list into a dataframe
input_sentences_complete_df = pd.DataFrame(
@@ -403,7 +427,15 @@

@shared_task(bind=True)
def generate_ocr_prediction_json(
self, dataset_instance_id, user_id, api_type, automate_missing_data_items
self,
dataset_instance_id,
user_id,
api_type,
automate_missing_data_items,
filter_string,
sampling_mode,
sampling_parameters,
variable_parameters,
):
"""Function to generate OCR prediction data and to save to the same data item.
Args:
@@ -436,7 +468,14 @@ def generate_ocr_prediction_json(
)
except Exception as e:
ocr_data_items = []

if filter_string and sampling_mode and sampling_parameters:
ocr_data_items = get_filtered_items(
"OCRDocument",
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)
# converting the dataset_instance to pandas dataframe.
ocr_data_items_df = pd.DataFrame(
ocr_data_items,
@@ -555,7 +594,15 @@ def generate_ocr_prediction_json(

@shared_task(bind=True)
def generate_asr_prediction_json(
self, dataset_instance_id, user_id, api_type, automate_missing_data_items
self,
dataset_instance_id,
user_id,
api_type,
automate_missing_data_items,
filter_string,
sampling_mode,
sampling_parameters,
variable_parameters,
):
"""Function to generate ASR prediction data and to save to the same data item.
Args:
@@ -589,7 +636,14 @@ def generate_asr_prediction_json(
)
except Exception as e:
asr_data_items = []

if filter_string and sampling_mode and sampling_parameters:
asr_data_items = get_filtered_items(
"SpeechConversation",
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)
# converting the dataset_instance to pandas dataframe.
asr_data_items_df = pd.DataFrame(
asr_data_items,
@@ -703,7 +757,16 @@ def generate_asr_prediction_json(


@shared_task(bind=True)
def populate_draft_data_json(self, pk, user_id, fields_list):
def populate_draft_data_json(
self,
pk,
user_id,
fields_list,
filter_string,
sampling_mode,
sampling_parameters,
variable_parameters,
):
task_name = "populate_draft_data_json"
try:
dataset_instance = DatasetInstance.objects.get(pk=pk)
@@ -712,6 +775,10 @@ def populate_draft_data_json(self, pk, user_id, fields_list):
dataset_type = dataset_instance.dataset_type
dataset_model = apps.get_model("dataset", dataset_type)
dataset_items = dataset_model.objects.filter(instance_id=dataset_instance)
if filter_string and sampling_mode and sampling_parameters:
dataset_items = get_filtered_items(
dataset_type, pk, filter_string, sampling_mode, sampling_parameters
)
cnt = 0
for dataset_item in dataset_items:
new_draft_data_json = {}
@@ -1695,3 +1762,98 @@ def upload_all_projects_to_blob_and_get_url(csv_files_directory):
return "Error in generating url"
blob_url = f"https://{account_name}.blob.{endpoint_suffix}/{CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS}/{blob_client.blob_name}?{sas_token}"
return blob_url


def get_filtered_items(
dataset_model,
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
):
registry_helper = ProjectRegistry.get_instance()
project_type = registry_helper.get_project_name_from_dataset(dataset_model)
if not isinstance(dataset_instance_id, list):
dataset_instance_id = [dataset_instance_id]
filtered_items = filter_data_items(
project_type=project_type,
dataset_instance_ids=dataset_instance_id,
filter_string=filter_string,
)
# Apply sampling
if sampling_mode == BATCH:
batch_size = sampling_parameters["batch_size"]
try:
batch_number = sampling_parameters["batch_number"]
if len(batch_number) == 0:
batch_number = [1]
except KeyError:
batch_number = [1]
sampled_items = []
for batch_num in batch_number:
sampled_items += filtered_items[
batch_size * (batch_num - 1) : batch_size * batch_num
]
else:
sampled_items = filtered_items
return sampled_items


@shared_task(
bind=True,
)
def update_SpeechConversation(self, lang, pid, auto_annotation, user_id):
UPDATE_SPEECH_CONVERSATION_API_URL = os.getenv("UPDATE_SPEECH_CONVERSATION_API_URL")
user_name = User.objects.filter(id=user_id)[0].username
data_item_list = [
t.input_data_id
for t in Task.objects.filter(project_id=pid, task_status="incomplete")
]
tasks_objects = Task.objects.filter(project_id=pid, task_status="incomplete")
related_tasks_ids = [task.id for task in tasks_objects]
related_annos = Annotation.objects.filter(task__id__in=related_tasks_ids)
for anno in related_annos:
anno.delete()
for task in tasks_objects:
task.delete()
data_items = SpeechConversation.objects.filter(id__in=data_item_list)
data_items_list = []
for data_item in tqdm(data_items):
try:
MEDIA_URL = data_item.audio_url
pred_json = (
json.loads(data_item.prediction_json)
if isinstance(data_item.prediction_json, str)
else data_item.prediction_json
)
data = [{"audioUrl": MEDIA_URL, "audioJson": pred_json, "audioLang": lang}]
pred_text_json = requests.post(
UPDATE_SPEECH_CONVERSATION_API_URL, json=json.dumps(data)
)
json_pred_final = json.loads(pred_text_json.text)[0]
except:
pass
setattr(data_item, "prediction_json", json_pred_final)
data_items_list.append(data_item)
SpeechConversation.objects.bulk_update(data_items_list, ["prediction_json"], 512)

data_items_list = []
for data_item in tqdm(data_items):
new_draft_data_json = {}
pred_json = (
json.loads(data_item.prediction_json)
if isinstance(data_item.prediction_json, str)
else data_item.prediction_json
)
try:
new_draft_data_json["transcribed_json"] = getattr(
data_item, "prediction_json"
)
if new_draft_data_json["transcribed_json"] == "None":
del new_draft_data_json["transcribed_json"]
except:
pass
setattr(data_item, "draft_data_json", new_draft_data_json)
data_items_list.append(data_item)
SpeechConversation.objects.bulk_update(data_items_list, ["draft_data_json"], 512)
print(f"SpeechConversation Dataset updated for {pid} by {user_name}")
1 change: 1 addition & 0 deletions backend/functions/urls.py
@@ -29,6 +29,7 @@
),
path("schedule_project_reports_email", schedule_project_reports_email),
path("download_all_projects", download_all_projects),
path("schedule_update_SpeechConversation", schedule_update_SpeechConversation),
]

# urlpatterns = format_suffix_patterns(urlpatterns)
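The added route exposes `schedule_update_SpeechConversation`, which queues the `update_SpeechConversation` task shown earlier. The view implementation is not part of this diff, so the following trigger is only a guess at the request contract; the payload keys simply mirror the task's arguments and the base URL is assumed:

import requests

requests.post(
    "https://backend.example.com/functions/schedule_update_SpeechConversation",
    json={"lang": "hi", "pid": 123, "auto_annotation": True},  # assumed keys
    headers={"Authorization": "Token <api-token>"},             # assumed auth scheme
    timeout=30,
)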