Mega more data entry #132

Merged · 7 commits · Nov 15, 2020
8 changes: 5 additions & 3 deletions miniconf/load_site_data.py
@@ -790,10 +790,12 @@ def build_workshop_blocks(t: Dict[str, Any]) -> List[SessionInfo]:
authors=extract_list_field(item, "authors"),
track=workshop_title(workshop_id),
paper_type="Workshop",
abstract=None,
tldr=None,
abstract=item.get("abstract"),
tldr=item["abstract"][:250] + "..."
if item["abstract"]
else None,
keywords=[],
pdf_url=None,
pdf_url=item.get("pdf_url"),
demo_url=None,
sessions=[],
similar_paper_uids=[],
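The new tldr logic above truncates long abstracts to a 250-character teaser, while papers without an abstract keep tldr unset. A minimal sketch of that behaviour (not taken from the PR; `item` stands for one raw workshop-paper record):

```python
from typing import Any, Dict, Optional

def make_tldr(item: Dict[str, Any]) -> Optional[str]:
    # Empty or missing abstracts yield no tldr; everything else is cut
    # to a 250-character teaser, mirroring the diff above.
    abstract = item.get("abstract")
    return abstract[:250] + "..." if abstract else None
```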
5 changes: 3 additions & 2 deletions scripts/dataentry/socials.py
@@ -37,7 +37,7 @@ def generate_socials():
],
)
df = df.dropna(subset=["ID"])
df = df[:-1]
df = df.drop([df.index[-2]])

zoom_df = pd.read_excel(
PATH_ZOOM_ACCOUNTS_WITH_PASSWORDS, sheet_name="Affinity"
@@ -57,6 +57,7 @@ def generate_socials():
id_to_organizers = {
row["ID"]: [e.strip() for e in row["Organizers"].split(",")]
for _, row in df.iterrows()
if row["Organizers"]
}
id_to_name = {row["ID"]: row["Event"] for _, row in df.iterrows()}
id_to_channel = {row["ID"]: row["Channel Name"] for _, row in df.iterrows()}
@@ -122,7 +123,7 @@ def generate_socials():
for idx, row in df.iterrows():
name = "S-" + row["Session Name"].strip()

if uid.startswith("B") and row["Host"]:
if (uid.startswith("B") or uid.startswith("M")) and row["Host"]:
name = name + " with " + row["Host"]

day = row["Day"]
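The row-trimming change above switches from dropping the sheet's last row to dropping the second-to-last row by index label. A toy illustration of the difference, using a made-up frame since the real sheet layout is not part of this diff:

```python
import pandas as pd

df = pd.DataFrame({"ID": ["A", "B", "C", "D"], "Event": ["a", "b", "c", "d"]})

trimmed = df[:-1]                  # old behaviour: removes the last row ("D")
dropped = df.drop([df.index[-2]])  # new behaviour: removes the second-to-last row ("C")
```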
262 changes: 210 additions & 52 deletions scripts/dataentry/workshops.py
@@ -1,27 +1,40 @@
import csv
from collections import defaultdict
from dataclasses import dataclass
import random
from typing import List, Dict, Any
import re
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any
from typing import List, Dict
import xml.etree.ElementTree as ET

import pandas as pd
import pytz
import ruamel
from ftfy import fix_text
from openpyxl import load_workbook
from pylatexenc.latex2text import LatexNodes2Text
from ruamel import yaml

import numpy as np

from datetime import datetime, time, timedelta

import pandas as pd

import pytz
from ruamel import yaml

# https://docs.google.com/spreadsheets/d/19LRnJpae5NQd0D1NEO40kTbwDvS9f125tpsjBdevrcs/edit#gid=0
from scripts.dataentry.paths import *

fix = {
"490": "4th Workshop on Structured Prediction for NLP",
"510": "CoNLL 2020",
"884": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"1093": "SIGTYP 2020: The Second Workshop on Computational Research in Linguistic Typology",
"1761": "Search-Oriented Conversational AI (SCAI) 2",
"2217": "The Fourth Workshop on Online Abuse and Harms (WOAH) a.k.a. ALW",
"2487": "1st Workshop on Computational Approaches to Discourse",
"2575": "Workshop on Insights from Negative Results in NLP",
"2797": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"2800": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"2976": "BlackboxNLP 2020: Analyzing and interpreting neural networks for NLP",
"3476": "Interactive and Executable Semantic Parsing (Int-Ex)",
"3561": "BlackboxNLP 2020: Analyzing and interpreting neural networks for NLP",
}


@dataclass
class Session:
@@ -38,6 +51,17 @@ class Workshop:
description: str


@dataclass
class Paper:
uid: str
title: str
authors: str
abstract: str
track: str
kind: str
link: str


def load_workshop_overview_excel() -> pd.DataFrame:
wb = load_workbook(PATH_WORKSHOPS_OVERVIEW)
ws = wb.worksheets[0]
@@ -132,7 +156,6 @@ def load_schedule():
for ws in wb.worksheets[4:]:
workshop_id = ws["B2"].value
assert workshop_id.startswith("WS-"), "Does not start with WS: " + workshop_id
print(workshop_id, ws.title)

description = ws["B3"].value or ""
ws.delete_rows(1, 6)
@@ -205,35 +228,9 @@ def load_slideslive():

workshop_df = load_workshop_overview_excel()

fix = {
"490": "4th Workshop on Structured Prediction for NLP",
"510": "CoNLL 2020",
"884": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"1093": "SIGTYP 2020: The Second Workshop on Computational Research in Linguistic Typology",
"1761": "Search-Oriented Conversational AI (SCAI) 2",
"2217": "The Fourth Workshop on Online Abuse and Harms (WOAH) a.k.a. ALW",
"2487": "1st Workshop on Computational Approaches to Discourse",
"2575": "Workshop on Insights from Negative Results in NLP",
"2797": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"2800": "Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures",
"2976": "BlackboxNLP 2020: Analyzing and interpreting neural networks for NLP",
"3476": "Interactive and Executable Semantic Parsing (Int-Ex)",
"3561": "BlackboxNLP 2020: Analyzing and interpreting neural networks for NLP",
}

ws_name_to_id = {
row["Name"]: row["UID"].strip() for _, row in workshop_df.iterrows()
}
corrected_venues = []
for _, row in df.iterrows():
venue_id = row["Organizer track name"]
if row["Unique ID"].strip() in fix:
correct_venue_name = fix[row["Unique ID"]]
venue_id = ws_name_to_id[correct_venue_name]

corrected_venues.append(venue_id)

df["Organizer track name"] = corrected_venues

return df

@@ -251,44 +248,205 @@ def generate_workshop_papers(slideslive: pd.DataFrame):

title = row["Title"].replace("\n", " ")
title = LatexNodes2Text().latex_to_text(title)
title = fix_text(title)
author_list = [fix_text(e.strip()) for e in re.split(",| and | And ", row["Speakers"])]
title = fix_text(title).strip()
author_list = [
fix_text(e.strip()) for e in re.split(",| and | And ", row["Speakers"])
]

ws = row["Organizer track name"].strip()
uid = row["Unique ID"].strip()

print(ws, uid)
if ws == "WS-15" and str(uid) in fix.keys():
continue

venues.append(ws)
UIDs.append(f"{ws}.{uid}")
titles.append(title)
authors.append(
"|".join(author_list)
)
authors.append("|".join(author_list))
presentation_ids.append(
row["SlidesLive link"].replace("https://slideslive.com/", "")
)

anthology_papers = get_anthology_workshop_papers()
title_to_anthology_paper = {a.title.strip().lower(): a for a in anthology_papers}
author_to_anthology_paper = {a.authors.lower(): a for a in anthology_papers}

unmatched = []
uid_to_anthology_paper = {}
for uid, title, author in zip(UIDs, titles, authors):
if uid.startswith(("WS-2")):
continue

if title.lower() in title_to_anthology_paper:
assert uid not in uid_to_anthology_paper
uid_to_anthology_paper[uid] = title_to_anthology_paper[title.lower()]
else:
unmatched.append((uid, title, author.lower()))

for uid, title, author in list(unmatched):
if author.lower() in author_to_anthology_paper:
assert uid not in uid_to_anthology_paper, (
uid,
title,
author,
uid_to_anthology_paper[uid],
)
uid_to_anthology_paper[uid] = author_to_anthology_paper[author.lower()]
unmatched.remove((uid, title, author.lower()))

unmatched_df = pd.DataFrame(unmatched)
unmatched_df.to_csv("unmatched_workshop_papers.csv", index=False)
for e in unmatched:
print(e)

print(len(unmatched), len(uid_to_anthology_paper))

abstracts = []
urls = []
for uid in UIDs:
if uid in uid_to_anthology_paper:
paper = uid_to_anthology_paper[uid]
abstracts.append(paper.abstract)
urls.append(paper.link)
else:
abstracts.append("")
urls.append("")

data = {
"workshop": venues,
"UID": UIDs,
"title": titles,
"authors": authors,
"abstract": abstracts,
"presentation_id": presentation_ids,
"pdf_url": urls,
}

columns = ["workshop", "UID", "title", "authors", "presentation_id"]
columns = [
"workshop",
"UID",
"title",
"authors",
"abstract",
"presentation_id",
"pdf_url",
]
df = pd.DataFrame(data, columns=columns)
df = df.drop_duplicates(subset=["UID"])

df.to_csv(PATH_YAMLS / "workshop_papers.csv", index=False)


def get_anthology_workshop_papers() -> List[Paper]:
anthology = (
Path(
r"C:\Users\klie\AppData\Roaming\JetBrains\PyCharm2020.2\scratches\emnlp\acl-anthology"
)
/ "data"
)

conference = "emnlp"
year = 2020

mapping = {
"2020.conll-1": "WS-1",
"2020.alw-1": "WS-17",
"2020.blackboxnlp-1": "WS-25",
"2020.clinicalnlp-1": "WS-12",
"2020.cmcl-1": "WS-5",
"2020.codi-1": "WS-16",
"2020.deelio-1": "WS-13",
"2020.eval4nlp-1": "WS-20",
"2020.insights-1": "WS-3",
"2020.intexsempar-1": "WS-6",
"2020.louhi-1": "WS-19",
"2020.nlpbt-1": "WS-23",
"2020.nlpcovid19-1": "WS-26",
"2020.nlpcss-1": "WS-18",
"2020.nlposs-1": "WS-9",
"2020.privatenlp-1": "WS-24",
"2020.scai-1": "WS-4",
"2020.sdp-1": "WS-7",
"2020.sigtyp-1": "WS-11",
"2020.splu-1": "WS-10",
"2020.spnlp-1": "WS-21",
"2020.sustainlp-1": "WS-15",
"2020.wnut-1": "WS-14",
"2020.findings-1": "findings",
}

papers = []
for venue in mapping.keys():
if venue.endswith("-1"):
file_name = venue[:-2]
else:
file_name = venue

path_to_xml = anthology / "xml" / f"{file_name}.xml"
tree = ET.parse(path_to_xml)
root = tree.getroot()
collection_id = root.attrib["id"]

for volume in root.findall("volume"):

volume_id = volume.attrib["id"]

for paper in volume.findall("paper"):
paper_id = paper.attrib["id"]
title = "".join(paper.find("title").itertext())
uid = f"{collection_id}-{volume_id}.{paper_id}"
authors = [
" ".join(author.itertext()) for author in paper.findall("author")
]
authors = "|".join(authors)

if paper.find("abstract") is not None:
abstract = "".join(paper.find("abstract").itertext())
else:
abstract = ""

link = f"https://www.aclweb.org/anthology/{uid}"

track = mapping[venue]
kind = None

if track.startswith("W"):
kind = "workshop"
elif track == "main":
kind = "long"
else:
kind = "findings"

assert kind

paper = Paper(
uid=uid,
title=title,
authors=authors,
abstract=abstract,
track=track,
kind=kind,
link=link,
)

papers.append(paper)

return papers


def is_not_paper(row) -> bool:
uid = row["Unique ID"].lower()
title = row["Title"].lower()

return (
"invited" in uid
or "challenge" in uid
or "invited" in title
or row["Unique ID"] == "Shared task"
("invited" in uid)
or ("challenge" in uid)
or ("invited" in title)
or ("keynote" in title)
or ("keynote" in uid)
or (row["Unique ID"] == "Shared task")
or (title == "tba" and "paper" not in uid)
)


@@ -325,9 +483,9 @@ def get_zooms() -> Dict[str, List[str]]:


if __name__ == "__main__":
#download_slideslive()
#download_workshops()
#download_zooms()
# download_slideslive()
# download_workshops()
# download_zooms()

# load_csv()
data = build_workshops_basics()
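generate_workshop_papers now enriches the SlidesLive rows with abstracts and PDF links by matching them against ACL Anthology entries, first by lower-cased title and then, for leftovers, by the pipe-joined author string. A condensed sketch of that two-pass matching (the helper name and signature are illustrative, not from the PR; the Paper fields follow the dataclass above):

```python
from typing import Dict, List, Tuple

def match_to_anthology(uids, titles, authors, anthology_papers) -> Dict[str, "Paper"]:
    by_title = {p.title.strip().lower(): p for p in anthology_papers}
    by_authors = {p.authors.lower(): p for p in anthology_papers}

    matched: Dict[str, "Paper"] = {}
    unmatched: List[Tuple[str, str, str]] = []

    # First pass: exact (lower-cased) title match.
    for uid, title, author in zip(uids, titles, authors):
        paper = by_title.get(title.lower())
        if paper is not None:
            matched[uid] = paper
        else:
            unmatched.append((uid, title, author.lower()))

    # Second pass: fall back to the author string for anything still unmatched.
    for uid, title, author in list(unmatched):
        paper = by_authors.get(author)
        if paper is not None:
            matched[uid] = paper
            unmatched.remove((uid, title, author))

    return matched
```

Rows that still have no match are written to workshop_papers.csv with empty abstract and pdf_url fields.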
2 changes: 1 addition & 1 deletion sitedata/config.yml
@@ -23,7 +23,7 @@ default_presentation_id: 38931484
# Remove or change to your chat server.
chat_server: emnlp2020.rocket.chat

gather_town: https://gather.town/
gather_town: https://www.virtualchair.net/events/emnlp2020
connected_papers_conference_name: EMNLP2020

