From e425c905284d223c67874bc7ae1a844bd9c52ef3 Mon Sep 17 00:00:00 2001
From: Shane Steinert-Threlkeld <ssshanest@gmail.com>
Date: Tue, 7 May 2024 13:39:25 -0700
Subject: [PATCH] add regression to notebook

---
 results/analysis.ipynb | 169 +++++++++++++++++++----------------------
 1 file changed, 79 insertions(+), 90 deletions(-)

diff --git a/results/analysis.ipynb b/results/analysis.ipynb
index 99a1050..487f6b4 100644
--- a/results/analysis.ipynb
+++ b/results/analysis.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,15 +14,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'binding-reconstruction', 'binding-domain', 'rel-cl', 'npi-sim-ques', 'binding-c-command', 'binding-case', 'passive', 'existential-there-quantifier', 'det-noun', 'npi-only', 'pp-mod-subj', 'full', 're-irr-sv-agr', 'det-adj-noun', 'npi-sent-neg', 'superlative-quantifier'}\n",
-      "{'sentential_negation_npi_scope', 'existential_there_quantifiers_1', 'matrix_question_npi_licensor_present', 'superlative_quantifiers_1', 'principle_A_case_1', 'principle_A_domain_1', 'principle_A_domain_3', 'principle_A_case_2', 'determiner_noun_agreement_with_adj_2', 'distractor_agreement_relative_clause', 'irregular_plural_subject_verb_agreement_2', 'determiner_noun_agreement_with_adj_irregular_1', 'determiner_noun_agreement_with_adjective_1', 'irregular_plural_subject_verb_agreement_1', 'regular_plural_subject_verb_agreement_2', 'sentential_negation_npi_licensor_present', 'principle_A_c_command', 'principle_A_reconstruction', 'superlative_quantifiers_2', 'determiner_noun_agreement_irregular_1', 'principle_A_domain_2', 'determiner_noun_agreement_1', 'only_npi_scope', 'distractor_agreement_relational_noun', 'passive_2', 'passive_1', 'determiner_noun_agreement_2', 'regular_plural_subject_verb_agreement_1', 'determiner_noun_agreement_with_adj_irregular_2', 'determiner_noun_agreement_irregular_2', 'only_npi_licensor_present'}\n"
+      "{'npi-sent-neg', 'npi-only', 'npi-sim-ques', 'existential-there-quantifier', 'pp-mod-subj', 'binding-c-command', 'binding-case', 'rel-cl', 'binding-reconstruction', 'det-noun', 'full', 'det-adj-noun', 'passive', 'binding-domain', 're-irr-sv-agr', 'superlative-quantifier'}\n",
+      "{'superlative_quantifiers_2', 'distractor_agreement_relative_clause', 'principle_A_c_command', 'distractor_agreement_relational_noun', 'regular_plural_subject_verb_agreement_1', 'irregular_plural_subject_verb_agreement_1', 'determiner_noun_agreement_with_adj_irregular_1', 'principle_A_case_2', 'sentential_negation_npi_scope', 'principle_A_domain_3', 'principle_A_case_1', 'determiner_noun_agreement_irregular_2', 'determiner_noun_agreement_with_adj_irregular_2', 'regular_plural_subject_verb_agreement_2', 'determiner_noun_agreement_with_adjective_1', 'matrix_question_npi_licensor_present', 'determiner_noun_agreement_1', 'only_npi_licensor_present', 'determiner_noun_agreement_irregular_1', 'determiner_noun_agreement_2', 'passive_2', 'existential_there_quantifiers_1', 'superlative_quantifiers_1', 'determiner_noun_agreement_with_adj_2', 'sentential_negation_npi_licensor_present', 'principle_A_domain_2', 'passive_1', 'principle_A_domain_1', 'irregular_plural_subject_verb_agreement_2', 'principle_A_reconstruction', 'only_npi_scope'}\n"
      ]
     }
    ],
@@ -85,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -266,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -276,7 +276,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -296,7 +296,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -324,6 +324,32 @@
     "print(cohen_d(ppl_unstacked[\"lstm\"], ppl_unstacked[\"transformer\"]))\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Pearson correlation between test perplexity (`test_ppl`) and corpus size (`corpus_tokens`), computed separately for each architecture:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PearsonRResult(statistic=-0.9694350791008717, pvalue=0.0)\n",
+      "PearsonRResult(statistic=-0.9755968743080783, pvalue=0.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(scipy.stats.pearsonr(main_data[main_data[\"arch\"]==\"lstm\"][\"corpus_tokens\"], main_data[main_data[\"arch\"]==\"lstm\"][\"test_ppl\"]))\n",
+    "print(scipy.stats.pearsonr(main_data[main_data[\"arch\"]==\"transformer\"][\"corpus_tokens\"], main_data[main_data[\"arch\"]==\"transformer\"][\"test_ppl\"]))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -340,7 +366,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -373,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -401,104 +427,67 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Next steps"
+    "## Regression Analysis\n",
+    "\n",
+    "Here's an attempt at a regression that tries to determine which factors are and are not responsible for the accuracy deltas. Let me know what you think and/or what I'm forgetting!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
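+    "Rough summary: architecture _on its own_ is not significant! Neither is _test perplexity_! The only consistently significant predictors are `filter_target` and all of the interactions involving it. Does this make sense? I'm also not sure this is the best specification of the predictors for the model. I used random intercepts for each combination of corpus and benchmark; I found similar results when using random intercepts for corpus only. Note that the estimated group variance is 0.000 and statsmodels warns that the MLE may be on the boundary of the parameter space, so the random intercepts appear to contribute very little in this specification."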
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "            Mixed Linear Model Regression Results\n",
-      "=============================================================\n",
-      "Model:              MixedLM   Dependent Variable:   test_ppl \n",
-      "No. Observations:   10720     Method:               REML     \n",
-      "No. Groups:         2         Scale:                0.0268   \n",
-      "Min. group size:    5360      Log-Likelihood:       4158.2101\n",
-      "Max. group size:    5360      Converged:            Yes      \n",
-      "Mean group size:    5360.0                                   \n",
-      "-------------------------------------------------------------\n",
-      "               Coef.  Std.Err.    z     P>|z|  [0.025  0.975]\n",
-      "-------------------------------------------------------------\n",
-      "Intercept     110.442    2.130   51.849 0.000 106.267 114.617\n",
-      "corpus_tokens  -0.000    0.000 -419.795 0.000  -0.000  -0.000\n",
-      "Group Var       9.034   41.311                               \n",
-      "=============================================================\n",
-      "\n",
-      "                            OLS Regression Results                            \n",
-      "==============================================================================\n",
-      "Dep. Variable:               test_ppl   R-squared:                       0.998\n",
-      "Model:                            OLS   Adj. R-squared:                  0.998\n",
-      "Method:                 Least Squares   F-statistic:                 1.499e+06\n",
-      "Date:                Tue, 23 Apr 2024   Prob (F-statistic):               0.00\n",
-      "Time:                        11:35:29   Log-Likelihood:                 4427.0\n",
-      "No. Observations:               10720   AIC:                            -8846.\n",
-      "Df Residuals:                   10716   BIC:                            -8817.\n",
-      "Df Model:                           3                                         \n",
-      "Covariance Type:            nonrobust                                         \n",
-      "=====================================================================================================\n",
-      "                                        coef    std err          t      P>|t|      [0.025      0.975]\n",
-      "-----------------------------------------------------------------------------------------------------\n",
-      "Intercept                           116.7298      0.198    589.364      0.000     116.342     117.118\n",
-      "arch[T.transformer]                 -12.5748      0.280    -44.894      0.000     -13.124     -12.026\n",
-      "corpus_tokens                     -9.572e-07      3e-09   -318.976      0.000   -9.63e-07   -9.51e-07\n",
-      "corpus_tokens:arch[T.transformer]  9.319e-08   4.24e-09     21.959      0.000    8.49e-08    1.02e-07\n",
-      "==============================================================================\n",
-      "Omnibus:                     2121.682   Durbin-Watson:                   0.697\n",
-      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5421.708\n",
-      "Skew:                          -1.083   Prob(JB):                         0.00\n",
-      "Kurtosis:                       5.729   Cond. No.                     1.56e+10\n",
-      "==============================================================================\n",
-      "\n",
-      "Notes:\n",
-      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
-      "[2] The condition number is large, 1.56e+10. This might indicate that there are\n",
-      "strong multicollinearity or other numerical problems.\n"
+      "                            Mixed Linear Model Regression Results\n",
+      "=============================================================================================\n",
+      "Model:                        MixedLM             Dependent Variable:             blimp_delta\n",
+      "No. Observations:             10720               Method:                         REML       \n",
+      "No. Groups:                   16                  Scale:                          0.0014     \n",
+      "Min. group size:              670                 Log-Likelihood:                 19977.0387 \n",
+      "Max. group size:              670                 Converged:                      Yes        \n",
+      "Mean group size:              670.0                                                          \n",
+      "---------------------------------------------------------------------------------------------\n",
+      "                                                   Coef.  Std.Err.   z    P>|z| [0.025 0.975]\n",
+      "---------------------------------------------------------------------------------------------\n",
+      "Intercept                                           0.365    0.438  0.833 0.405 -0.494  1.225\n",
+      "filter_target[T.True]                               2.492    0.272  9.169 0.000  1.959  3.025\n",
+      "arch[T.transformer]                                 0.065    0.054  1.215 0.224 -0.040  0.171\n",
+      "filter_target[T.True]:arch[T.transformer]          -1.987    0.377 -5.265 0.000 -2.726 -1.247\n",
+      "corpus_tokens                                      -0.000    0.000 -0.721 0.471 -0.000  0.000\n",
+      "test_ppl                                           -0.004    0.004 -0.958 0.338 -0.011  0.004\n",
+      "test_ppl:filter_target[T.True]                     -0.047    0.005 -9.316 0.000 -0.057 -0.037\n",
+      "test_ppl:arch[T.transformer]                       -0.002    0.001 -1.535 0.125 -0.004  0.000\n",
+      "test_ppl:filter_target[T.True]:arch[T.transformer]  0.036    0.008  4.733 0.000  0.021  0.050\n",
+      "Group Var                                           0.000    0.000                           \n",
+      "=============================================================================================\n",
+      "\n"
      ]
     },
     {
-     "ename": "AttributeError",
-     "evalue": "'MixedLMResults' object has no attribute 'ssr'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[9], line 9\u001b[0m\n\u001b[1;32m      6\u001b[0m fixed_test_ppl_result \u001b[38;5;241m=\u001b[39m fixed_test_ppl_model\u001b[38;5;241m.\u001b[39mfit()\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28mprint\u001b[39m(fixed_test_ppl_result\u001b[38;5;241m.\u001b[39msummary())\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43msm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstats\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43manova_lm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfixed_test_ppl_result\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmixed_test_ppl_result\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m     11\u001b[0m lstm_test_ppl_model \u001b[38;5;241m=\u001b[39m smf\u001b[38;5;241m.\u001b[39mols(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_ppl ~ corpus_tokens\u001b[39m\u001b[38;5;124m\"\u001b[39m, main_data[main_data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124march\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlstm\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m     12\u001b[0m lstm_test_ppl_result \u001b[38;5;241m=\u001b[39m lstm_test_ppl_model\u001b[38;5;241m.\u001b[39mfit()\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/statsmodels/stats/anova.py:369\u001b[0m, in \u001b[0;36manova_lm\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scale: \u001b[38;5;66;03m# assume biggest model is last\u001b[39;00m\n\u001b[1;32m    367\u001b[0m     scale \u001b[38;5;241m=\u001b[39m args[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mscale\n\u001b[0;32m--> 369\u001b[0m table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mssr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m [mdl\u001b[38;5;241m.\u001b[39mssr \u001b[38;5;28;01mfor\u001b[39;00m mdl \u001b[38;5;129;01min\u001b[39;00m args]\n\u001b[1;32m    370\u001b[0m table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_resid\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m [mdl\u001b[38;5;241m.\u001b[39mdf_resid \u001b[38;5;28;01mfor\u001b[39;00m mdl \u001b[38;5;129;01min\u001b[39;00m args]\n\u001b[1;32m    371\u001b[0m table\u001b[38;5;241m.\u001b[39mloc[table\u001b[38;5;241m.\u001b[39mindex[\u001b[38;5;241m1\u001b[39m:], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_diff\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mnp\u001b[38;5;241m.\u001b[39mdiff(table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_resid\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues)\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/statsmodels/stats/anova.py:369\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scale: \u001b[38;5;66;03m# assume biggest model is last\u001b[39;00m\n\u001b[1;32m    367\u001b[0m     scale \u001b[38;5;241m=\u001b[39m args[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mscale\n\u001b[0;32m--> 369\u001b[0m table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mssr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m [\u001b[43mmdl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mssr\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m mdl \u001b[38;5;129;01min\u001b[39;00m args]\n\u001b[1;32m    370\u001b[0m table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_resid\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m [mdl\u001b[38;5;241m.\u001b[39mdf_resid \u001b[38;5;28;01mfor\u001b[39;00m mdl \u001b[38;5;129;01min\u001b[39;00m args]\n\u001b[1;32m    371\u001b[0m table\u001b[38;5;241m.\u001b[39mloc[table\u001b[38;5;241m.\u001b[39mindex[\u001b[38;5;241m1\u001b[39m:], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_diff\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mnp\u001b[38;5;241m.\u001b[39mdiff(table[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdf_resid\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues)\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/statsmodels/base/wrapper.py:34\u001b[0m, in \u001b[0;36mResultsWrapper.__getattribute__\u001b[0;34m(self, attr)\u001b[0m\n\u001b[1;32m     31\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m     32\u001b[0m     \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m---> 34\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     35\u001b[0m data \u001b[38;5;241m=\u001b[39m results\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mdata\n\u001b[1;32m     36\u001b[0m how \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wrap_attrs\u001b[38;5;241m.\u001b[39mget(attr)\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'MixedLMResults' object has no attribute 'ssr'"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/statsmodels/regression/mixed_linear_model.py:2238: ConvergenceWarning: The MLE may be on the boundary of the parameter space.\n",
+      "  warnings.warn(msg, ConvergenceWarning)\n"
      ]
     }
    ],
    "source": [
-    "mixed_test_ppl_model = smf.mixedlm(\"test_ppl ~ corpus_tokens\", main_data, groups=main_data[\"arch\"])\n",
-    "mixed_test_ppl_result = mixed_test_ppl_model.fit()\n",
-    "print(mixed_test_ppl_result.summary())\n",
-    "\n",
-    "fixed_test_ppl_model = smf.ols(\"test_ppl ~ corpus_tokens + arch + corpus_tokens*arch\", main_data)\n",
-    "fixed_test_ppl_result = fixed_test_ppl_model.fit()\n",
-    "print(fixed_test_ppl_result.summary())\n",
-    "\n",
-    "print(sm.stats.anova_lm(fixed_test_ppl_result, mixed_test_ppl_result))\n",
-    "\n",
-    "lstm_test_ppl_model = smf.ols(\"test_ppl ~ corpus_tokens\", main_data[main_data[\"arch\"] == \"lstm\"])\n",
-    "lstm_test_ppl_result = lstm_test_ppl_model.fit()\n",
-    "print(lstm_test_ppl_result.summary())\n",
-    "\n",
-    "print(scipy.stats.pearsonr(main_data[main_data[\"arch\"]==\"lstm\"][\"corpus_tokens\"], main_data[main_data[\"arch\"]==\"lstm\"][\"test_ppl\"]))"
+    "main_data[\"corpus-and-benchmark\"] = main_data[\"corpus\"] + \"-\" + main_data[\"blimp_benchmark\"]\n",
+    "full_regression = smf.mixedlm(\"blimp_delta ~ corpus_tokens + test_ppl*filter_target*arch\", main_data, groups=main_data[\"corpus-and-benchmark\"])\n",
+    "full_regression_result = full_regression.fit()\n",
+    "print(full_regression_result.summary())\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {