diff --git a/README.md b/README.md index 59205cb..ebb66f9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # corpus-filtering +## TODOs + +- [ ] add minicons, ipykernel, statsmodels to environment + ## Development & Contribution Guidelines ### Basic Setup diff --git a/results/analysis.ipynb b/results/analysis.ipynb new file mode 100644 index 0000000..195611b --- /dev/null +++ b/results/analysis.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import scipy\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'binding-reconstruction', 'npi-sent-neg', 'npi-only', 'binding-case', 'passive', 'superlative-quantifier', 'pp-mod-subj', 'existential-there-quantifier', 'det-adj-noun', 'full', 'det-noun', 're-irr-sv-agr', 'binding-domain', 'rel-cl', 'binding-c-command', 'npi-sim-ques'}\n", + "{'determiner_noun_agreement_1', 'principle_A_domain_1', 'determiner_noun_agreement_with_adjective_1', 'determiner_noun_agreement_with_adj_irregular_2', 'sentential_negation_npi_licensor_present', 'regular_plural_subject_verb_agreement_1', 'principle_A_case_2', 'passive_1', 'determiner_noun_agreement_irregular_2', 'regular_plural_subject_verb_agreement_2', 'principle_A_c_command', 'determiner_noun_agreement_with_adj_irregular_1', 'matrix_question_npi_licensor_present', 'only_npi_scope', 'sentential_negation_npi_scope', 'determiner_noun_agreement_with_adj_2', 'principle_A_case_1', 'principle_A_domain_3', 'existential_there_quantifiers_1', 'passive_2', 'superlative_quantifiers_2', 'irregular_plural_subject_verb_agreement_2', 'principle_A_reconstruction', 'superlative_quantifiers_1', 'determiner_noun_agreement_irregular_1', 'only_npi_licensor_present', 'principle_A_domain_2', 
'distractor_agreement_relational_noun', 'irregular_plural_subject_verb_agreement_1', 'determiner_noun_agreement_2', 'distractor_agreement_relative_clause'}\n" + ] + } + ], + "source": [ + "blimp_corpus_map = {\n", + " \"full\": [],\n", + " \"pp-mod-subj\": [\"distractor_agreement_relational_noun\"],\n", + " \"rel-cl\": [\"distractor_agreement_relative_clause\"],\n", + " \"re-irr-sv-agr\": [\n", + " \"irregular_plural_subject_verb_agreement_1\",\n", + " \"irregular_plural_subject_verb_agreement_2\",\n", + " \"regular_plural_subject_verb_agreement_1\",\n", + " \"regular_plural_subject_verb_agreement_2\",\n", + " ],\n", + " \"npi-only\": [\"only_npi_licensor_present\", \"only_npi_scope\"],\n", + " \"npi-sent-neg\": [\n", + " \"sentential_negation_npi_licensor_present\",\n", + " \"sentential_negation_npi_scope\",\n", + " ],\n", + " \"npi-sim-ques\": [\"matrix_question_npi_licensor_present\"],\n", + " \"superlative-quantifier\": [\n", + " \"superlative_quantifiers_1\",\n", + " \"superlative_quantifiers_2\",\n", + " ],\n", + " \"existential-there-quantifier\": [\"existential_there_quantifiers_1\"],\n", + " \"binding-c-command\": [\"principle_A_c_command\"],\n", + " \"binding-case\": [\"principle_A_case_1\", \"principle_A_case_2\"],\n", + " \"binding-domain\": [\n", + " \"principle_A_domain_1\",\n", + " \"principle_A_domain_2\",\n", + " \"principle_A_domain_3\",\n", + " ],\n", + " \"binding-reconstruction\": [\"principle_A_reconstruction\"],\n", + " \"passive\": [\"passive_1\", \"passive_2\"],\n", + " \"det-adj-noun\": [\n", + " \"determiner_noun_agreement_with_adjective_1\",\n", + " \"determiner_noun_agreement_with_adj_2\",\n", + " \"determiner_noun_agreement_with_adj_irregular_1\",\n", + " \"determiner_noun_agreement_with_adj_irregular_2\",\n", + " ],\n", + " \"det-noun\": [\n", + " \"determiner_noun_agreement_1\",\n", + " \"determiner_noun_agreement_2\",\n", + " \"determiner_noun_agreement_irregular_1\",\n", + " \"determiner_noun_agreement_irregular_2\",\n", + " ],\n", 
+ "}\n", + "all_filters = set(blimp_corpus_map.keys())\n", + "benchmarks_with_filters = set([_ for xs in blimp_corpus_map.values() for _ in xs])\n", + "print(all_filters)\n", + "print(benchmarks_with_filters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read the main data" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['corpus', 'arch', 'seed', 'validation_loss', 'validation_ppl',\n", + " 'test_loss', 'test_ppl', 'blimp_benchmark', 'blimp_acc',\n", + " 'filter_target', 'z_validation_ppl', 'z_validation_loss', 'z_test_ppl',\n", + " 'z_test_loss', 'full_same_seed_acc', 'blimp_delta_same_seed',\n", + " 'full_all_seed_avg_acc', 'blimp_delta', 'field', 'linguistics_term',\n", + " 'corpus_tokens'],\n", + " dtype='object')\n", + "Unexpected exception formatting exception. Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/formatters.py\", line 223, in catch_format_error\n", + " r = method(self, *args, **kwargs)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/formatters.py\", line 344, in __call__\n", + " return method()\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/core/frame.py\", line 1106, in _repr_html_\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/format.py\", line 1110, in to_html\n", + " When formatting an Index subclass\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 88, in to_string\n", + " lines = self.render()\n", + " File 
\"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 644, in render\n", + " super().render()\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 94, in render\n", + " self._write_table()\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 267, in _write_table\n", + " self._write_header(indent + self.indent_delta)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 403, in _write_header\n", + " self._write_col_header(indent + self.indent_delta)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 383, in _write_col_header\n", + " row.extend(self._get_columns_formatted_values())\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py\", line 611, in _get_columns_formatted_values\n", + " return self.columns._format_flat(include_name=False)\n", + "AttributeError: 'Index' object has no attribute '_format_flat'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/interactiveshell.py\", line 2102, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 1310, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 1199, in structured_traceback\n", + " return 
VerboseTB.structured_traceback(\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 1052, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 978, in format_exception_as_a_whole\n", + " frames.append(self.format_record(record))\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 878, in format_record\n", + " frame_info.lines, Colors, self.has_colors, lvals\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/ultratb.py\", line 712, in lines\n", + " return self._sd.lines\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/utils.py\", line 144, in cached_property_wrapper\n", + " value = obj.__dict__[self.func.__name__] = self.func(obj)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/core.py\", line 734, in lines\n", + " pieces = self.included_pieces\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/utils.py\", line 144, in cached_property_wrapper\n", + " value = obj.__dict__[self.func.__name__] = self.func(obj)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/core.py\", line 681, in included_pieces\n", + " pos = scope_pieces.index(self.executing_piece)\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/utils.py\", line 144, in cached_property_wrapper\n", + " value = obj.__dict__[self.func.__name__] = self.func(obj)\n", + " File 
\"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/stack_data/core.py\", line 660, in executing_piece\n", + " return only(\n", + " File \"/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/executing/executing.py\", line 116, in only\n", + " raise NotOneValueFound('Expected one value, found 0')\n", + "executing.executing.NotOneValueFound: Expected one value, found 0\n" + ] + }, + { + "data": { + "text/plain": [ + " corpus arch seed validation_loss validation_ppl test_loss \\\n", + "0 full transformer 0 3.841752 46.607061 3.845174 \n", + "1 full transformer 1 3.840523 46.549825 3.844156 \n", + "2 full transformer 2 3.840050 46.527789 3.843551 \n", + "3 full transformer 3 3.841511 46.595850 3.844735 \n", + "4 full transformer 4 3.842284 46.631870 3.846043 \n", + "... ... ... ... ... ... ... \n", + "10715 passive lstm 0 3.975108 53.255876 3.977766 \n", + "10716 passive lstm 1 3.975204 53.261006 3.978100 \n", + "10717 passive lstm 2 3.974197 53.207382 3.977306 \n", + "10718 passive lstm 3 3.975786 53.291974 3.978186 \n", + "10719 passive lstm 4 3.977474 53.382045 3.980730 \n", + "\n", + " test_ppl blimp_benchmark blimp_acc filter_target \\\n", + "0 46.766802 adjunct_island 0.726 False \n", + "1 46.719216 adjunct_island 0.654 False \n", + "2 46.690965 adjunct_island 0.668 False \n", + "3 46.746313 adjunct_island 0.814 False \n", + "4 46.807473 adjunct_island 0.665 False \n", + "... ... ... ... ... \n", + "10715 53.397600 wh_vs_that_with_gap_long_distance 0.137 False \n", + "10716 53.415426 wh_vs_that_with_gap_long_distance 0.164 False \n", + "10717 53.373047 wh_vs_that_with_gap_long_distance 0.121 False \n", + "10718 53.420024 wh_vs_that_with_gap_long_distance 0.091 False \n", + "10719 53.556119 wh_vs_that_with_gap_long_distance 0.125 False \n", + "\n", + " ... z_validation_loss z_test_ppl z_test_loss full_same_seed_acc \\\n", + "0 ... -1.096013 -1.089433 -1.096798 0.726 \n", + "1 ... 
-1.114756 -1.103922 -1.112395 0.654 \n", + "2 ... -1.121978 -1.112524 -1.121662 0.668 \n", + "3 ... -1.099682 -1.095671 -1.103512 0.814 \n", + "4 ... -1.087896 -1.077049 -1.083481 0.665 \n", + "... ... ... ... ... ... \n", + "10715 ... 0.938086 0.929552 0.934563 0.144 \n", + "10716 ... 0.939555 0.934980 0.939677 0.139 \n", + "10717 ... 0.924191 0.922076 0.927517 0.161 \n", + "10718 ... 0.948421 0.936380 0.940995 0.156 \n", + "10719 ... 0.974180 0.977819 0.979977 0.087 \n", + "\n", + " blimp_delta_same_seed full_all_seed_avg_acc blimp_delta field \\\n", + "0 0.000 0.7054 0.0206 syntax \n", + "1 0.000 0.7054 -0.0514 syntax \n", + "2 0.000 0.7054 -0.0374 syntax \n", + "3 0.000 0.7054 0.1086 syntax \n", + "4 0.000 0.7054 -0.0404 syntax \n", + "... ... ... ... ... \n", + "10715 -0.007 0.1374 -0.0004 syntax \n", + "10716 0.025 0.1374 0.0266 syntax \n", + "10717 -0.040 0.1374 -0.0164 syntax \n", + "10718 -0.065 0.1374 -0.0464 syntax \n", + "10719 0.038 0.1374 -0.0124 syntax \n", + "\n", + " linguistics_term corpus_tokens \n", + "0 island_effects 66442068 \n", + "1 island_effects 66442068 \n", + "2 island_effects 66442068 \n", + "3 island_effects 66442068 \n", + "4 island_effects 66442068 \n", + "... ... ... 
\n", + "10715 filler_gap_dependency 66155000 \n", + "10716 filler_gap_dependency 66155000 \n", + "10717 filler_gap_dependency 66155000 \n", + "10718 filler_gap_dependency 66155000 \n", + "10719 filler_gap_dependency 66155000 \n", + "\n", + "[10720 rows x 21 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "main_data = pd.read_csv(\"data/tidy_results.csv\", index_col=0)\n", + "main_data = main_data.rename(columns = {\"blimp_delta_all_seed_avg\": \"blimp_delta\"})\n", + "print(main_data.columns)\n", + "main_data" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "arch\n", + "lstm 53.403396\n", + "transformer 46.970552\n", + "Name: validation_ppl, dtype: float64\n", + "TtestResult(statistic=271.4049864115043, pvalue=4.192319978874203e-29, df=15)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/7b/186zvw415gldw_ktxf30pyzc0000gn/T/ipykernel_33594/3779774374.py:9: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. 
df.median(level=1) should use df.groupby(level=1).median().\n", + " print(ppls_by_arch.mean(level=\"arch\"))\n" + ] + } + ], + "source": [ + "# group data by corpus and architecture, take the mean val ppl across seeds\n", + "ppls_by_arch = main_data.groupby([\"corpus\", \"arch\"])[\"validation_ppl\"].mean()\n", + "# pivot so each corpus is a row with one column per architecture (paired samples for the t-test)\n", + "ppl_unstacked = ppls_by_arch.unstack(level=\"arch\")\n", + "\n", + "# print the mean ppl for each architecture; groupby(level=...) replaces the deprecated mean(level=...)\n", + "print(ppls_by_arch.groupby(level=\"arch\").mean())\n", + "print(scipy.stats.ttest_rel(ppl_unstacked[\"lstm\"], ppl_unstacked[\"transformer\"]))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "corpus-filtering", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}