Fix dataloader for minimayosrs (bigscience-workshop#540)
* Reworked dataloader

- Changed data source since the previous one was faulty
- Added license
- Label in bigbio is now the average of the physician/coder scores rather than just the physician score (illustrated below)

* Change doc_id from NULL to sequential id
nomisto authored May 4, 2022
1 parent e24a20e commit 4c7066e
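
As a rough illustration of the new labeling, a minimal loading sketch. Two assumptions not stated in this commit: a local checkout of the repository, and the usual BigBio config-name pattern "{dataset}_bigbio_pairs".

    from datasets import load_dataset

    # Load the pairs view of MiniMayoSRS via the local dataloader script.
    # The script path and config name below are assumptions, not taken from this commit.
    ds = load_dataset("biodatasets/minimayosrs/minimayosrs.py", name="minimayosrs_bigbio_pairs")

    # Each record's "label" is now the mean of the physician and coder scores.
    print(ds["train"][0]["label"])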
Showing 1 changed file with 32 additions and 106 deletions: biodatasets/minimayosrs/minimayosrs.py
@@ -18,10 +18,11 @@
 nine medical coders and three physicians from the Mayo Clinic.
 """

-import csv
 from typing import Dict, List, Tuple

 import datasets
+import pandas as pd

 from bigbio.utils import schemas
 from bigbio.utils.configs import BigBioConfig
 from bigbio.utils.constants import Tasks
@@ -46,26 +47,15 @@
 achieved. The average correlation between physicians is 0.68. The average correlation between medical coders is 0.78.
 """

-_HOMEPAGE = "https://nlp.cs.vcu.edu/data.html#minimayosrs"
+_HOMEPAGE = "https://conservancy.umn.edu/handle/11299/196265"

-_LICENSE = "Unknown"
+_LICENSE = "CC0 1.0 Universal"

 _URLS = {
-    "source": [
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.terms",
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.physicians",
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.coders",
-    ],
-    "bigbio_pairs": [
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.terms",
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.physicians",
-        "https://nlp.cs.vcu.edu/data/similarity-data/MiniMayoSRS.coders",
-    ],
+    _DATASETNAME: "https://conservancy.umn.edu/bitstream/handle/11299/196265/MiniMayoSRS.csv?sequence=2&isAllowed=y"
 }

-_SUPPORTED_TASKS = [
-    Tasks.SEMANTIC_SIMILARITY
-]  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+_SUPPORTED_TASKS = [Tasks.SEMANTIC_SIMILARITY]

 _SOURCE_VERSION = "1.0.0"

@@ -107,12 +97,10 @@ def _info(self) -> datasets.DatasetInfo:
                 {
                     "text_1": datasets.Value("string"),
                     "text_2": datasets.Value("string"),
+                    "code_1": datasets.Value("string"),
+                    "code_2": datasets.Value("string"),
                     "label_physicians": datasets.Value("float32"),
-                    "code_1_physicians": datasets.Value("string"),
-                    "code_2_physicians": datasets.Value("string"),
                     "label_coders": datasets.Value("float32"),
-                    "code_1_coders": datasets.Value("string"),
-                    "code_2_coders": datasets.Value("string"),
                 }
             )

@@ -130,98 +118,36 @@ def _info(self) -> datasets.DatasetInfo:
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""

-        urls = _URLS[self.config.schema]
-        data_dir = dl_manager.download_and_extract(urls)
+        urls = _URLS[_DATASETNAME]
+        filepath = dl_manager.download_and_extract(urls)

         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": data_dir,
-                    "split": "train",
-                },
+                gen_kwargs={"filepath": filepath},
             )
         ]

-    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+    def _generate_examples(self, filepath) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""

-        if split == "train":
-            # texts, physicians, coders
-            texts = filepath[0]
-            physicians = filepath[1]
-            coders = filepath[2]
-
-            data_texts = []
-            with open(texts, encoding="utf-8") as csv_file:
-                csv_reader_texts = csv.reader(
-                    csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
-                )
-                for id_, row in enumerate(csv_reader_texts):
-                    text_1, text_2 = row[0].split("<>")
-                    data_texts.append([text_1, text_2])
-
-            data_physicians = []
-            with open(physicians, encoding="utf-8") as csv_file:
-                csv_reader_physicians = csv.reader(
-                    csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
-                )
-
-                for id_, row in enumerate(csv_reader_physicians):
-                    label_physicians, code_1_physicians, code_2_physicians = row[0].split("<>")
-                    data_physicians.append([label_physicians, code_1_physicians, code_2_physicians])
-
-            data_coders = []
-            with open(coders, encoding="utf-8") as csv_file:
-                csv_reader_coders = csv.reader(
-                    csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
-                )
-
-                for id_, row in enumerate(csv_reader_coders):
-                    label_coders, code_1_coders, code_2_coders = row[0].split("<>")
-                    data_coders.append([label_coders, code_1_coders, code_2_coders])
-
-            data = []
-            for i in range(len(data_coders)):
-                data.append(sum(list(zip(data_texts, data_physicians, data_coders))[i], []))
-
-            if self.config.schema == "source":
-                for id_, row in enumerate(data):
-                    (
-                        text_1,
-                        text_2,
-                        label_physicians,
-                        code_1_physicians,
-                        code_2_physicians,
-                        label_coders,
-                        code_1_coders,
-                        code_2_coders,
-                    ) = row
-
-                    yield id_, {
-                        "text_1": text_1,
-                        "text_2": text_2,
-                        "label_physicians": float(label_physicians),
-                        "code_1_physicians": code_1_physicians,
-                        "code_2_physicians": code_2_physicians,
-                        "label_coders": float(label_coders),
-                        "code_1_coders": code_1_coders,
-                        "code_2_coders": code_2_coders,
-                    }
-
-            elif self.config.schema == "bigbio_pairs":
-                uid = 0
-                for id_, row in enumerate(data):
-                    uid += 1
-                    text_1, text_2, label_physicians, _, _, _, _, _ = row
-                    yield id_, {
-                        "id": uid,  # uid is an unique identifier for every record that starts from 1
-                        "document_id": "NULL",
-                        "text_1": text_1,
-                        "text_2": text_2,
-                        "label": str(label_physicians),
-                    }
-
-            else:
-                print("There's no test/val split available for the given dataset")
-                return
+        data = pd.read_csv(
+            filepath,
+            sep=",",
+            header=0,
+            names=["label_physicians", "label_coders", "code_1", "code_2", "text_1", "text_2"],
+        )
+
+        if self.config.schema == "source":
+            for id_, row in data.iterrows():
+                yield id_, row.to_dict()
+
+        elif self.config.schema == "bigbio_pairs":
+            for id_, row in data.iterrows():
+                yield id_, {
+                    "id": id_,
+                    "document_id": id_,
+                    "text_1": row["text_1"],
+                    "text_2": row["text_2"],
+                    "label": str((row["label_physicians"] + row["label_coders"]) / 2),
+                }
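
For reference, a standalone sketch of what the reworked _generate_examples does with a single CSV row. The column order mirrors the names list in the diff above; the header text and values are invented for illustration:

    import pandas as pd
    from io import StringIO

    # One invented row in the MiniMayoSRS.csv column order
    # (header=0 marks the first row as a header, which `names` then replaces).
    csv_text = (
        "physician,coder,code1,code2,term1,term2\n"
        "3.8,3.2,C0000001,C0000002,term a,term b\n"
    )
    df = pd.read_csv(
        StringIO(csv_text),
        header=0,
        names=["label_physicians", "label_coders", "code_1", "code_2", "text_1", "text_2"],
    )
    row = df.iloc[0]
    # The bigbio label averages the physician and coder scores.
    label = str((row["label_physicians"] + row["label_coders"]) / 2)
    print(label)  # "3.5"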
