{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "from pprint import pprint\n",
    "from collections import OrderedDict\n",
    "import scipy.stats\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load human evaluation results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mauve</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p0.9')</th>\n",
       "      <td>0.193757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p1.0')</th>\n",
       "      <td>0.194512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p0.95')</th>\n",
       "      <td>0.192980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p1.0')</th>\n",
       "      <td>0.195805</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p0.9')</th>\n",
       "      <td>0.194425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p1.0')</th>\n",
       "      <td>0.196553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p0.95')</th>\n",
       "      <td>0.191461</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p1.0')</th>\n",
       "      <td>0.199148</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            mauve\n",
       "('gpt2', 'p0.9')         0.193757\n",
       "('gpt2', 'p1.0')         0.194512\n",
       "('gpt2-large', 'p0.95')  0.192980\n",
       "('gpt2-large', 'p1.0')   0.195805\n",
       "('gpt2-medium', 'p0.9')  0.194425\n",
       "('gpt2-medium', 'p1.0')  0.196553\n",
       "('gpt2-xl', 'p0.95')     0.191461\n",
       "('gpt2-xl', 'p1.0')      0.199148"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Numbers reported in the paper\n",
    "# mauve_scores_raw = {\n",
    "#       \"('gpt2', 'p0.9')\": 0.476346667,\n",
    "#       \"('gpt2', 'p1.0')\": 0.468094444,\n",
    "#       \"('gpt2-large', 'p0.95')\": 0.480351111,\n",
    "#       \"('gpt2-large', 'p1.0')\": 0.472142222,\n",
    "#       \"('gpt2-medium', 'p0.9')\": 0.477806667,\n",
    "#       \"('gpt2-medium', 'p1.0')\": 0.462046667,\n",
    "#       \"('gpt2-xl', 'p0.95')\":0.481145556,\n",
    "#       \"('gpt2-xl', 'p1.0')\": 0.472241111}\n",
    "# mauve_scores_raw = {\n",
    "#       \"('gpt2', 'p0.9')\": 0.816555203,\n",
    "#       \"('gpt2', 'p1.0')\": 0.815964738,\n",
    "#       \"('gpt2-large', 'p0.95')\": 0.818678255,\n",
    "#       \"('gpt2-large', 'p1.0')\": 0.813630059,\n",
    "#       \"('gpt2-medium', 'p0.9')\": 0.815414144,\n",
    "#       \"('gpt2-medium', 'p1.0')\": 0.812838286,\n",
    "#       \"('gpt2-xl', 'p0.95')\": 0.821078288,\n",
    "#       \"('gpt2-xl', 'p1.0')\": 0.806571319}\n",
    "# mauve_scores_raw = {\n",
    "#       \"('gpt2', 'p0.9')\": 0.193757143,\n",
    "#       \"('gpt2', 'p1.0')\": 0.19451222,\n",
    "#       \"('gpt2-large', 'p0.95')\": 0.192980018,\n",
    "#       \"('gpt2-large', 'p1.0')\": 0.195804673,\n",
    "#       \"('gpt2-medium', 'p0.9')\": 0.194424819,\n",
    "#       \"('gpt2-medium', 'p1.0')\": 0.196552829,\n",
    "#       \"('gpt2-xl', 'p0.95')\": 0.191460869,\n",
    "#       \"('gpt2-xl', 'p1.0')\": 0.199148149}\n",
    "\n",
    "# Active MAUVE scores; each key is the string repr of a two-element tuple\n",
    "# such as ('gpt2', 'p0.9').\n",
    "mauve_scores_raw = {\n",
    "    \"('gpt2', 'p0.9')\": 0.359315027,\n",
    "    \"('gpt2', 'p1.0')\": 0.318698191,\n",
    "    \"('gpt2-large', 'p0.95')\": 0.355567282,\n",
    "    \"('gpt2-large', 'p1.0')\": 0.337723148,\n",
    "    \"('gpt2-medium', 'p0.9')\": 0.357929565,\n",
    "    \"('gpt2-medium', 'p1.0')\": 0.309786716,\n",
    "    \"('gpt2-xl', 'p0.95')\": 0.359468025,\n",
    "    \"('gpt2-xl', 'p1.0')\": 0.331293712,\n",
    "}\n",
    "\n",
    "# Wrap the scores in a named Series whose index holds the dict keys.\n",
    "mauve_scores = pd.Series(data=mauve_scores_raw, name=\"mauve\")\n",
    "\n",
    "# Last expression of the cell: displayed as a one-column table.\n",
    "mauve_scores.to_frame()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the raw output file `mauve-human-eval-anon.csv` from [here](https://github.com/krishnap25/mauve-experiments/blob/main/human_evaluation.md) and place it in the same folder as this notebook. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative path of the raw human-evaluation CSV.\n",
    "results_fn = 'mauve-human-eval-anon.csv' \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>HITId</th>\n",
       "      <th>WorkerId</th>\n",
       "      <th>WorkTimeInSeconds</th>\n",
       "      <th>Input.idx</th>\n",
       "      <th>Input.model_b</th>\n",
       "      <th>Input.model_a</th>\n",
       "      <th>Input.ctx</th>\n",
       "      <th>Input.completionb</th>\n",
       "      <th>Input.completiona</th>\n",
       "      <th>Input.len_b</th>\n",
       "      <th>Input.len_a</th>\n",
       "      <th>Answer.q1</th>\n",
       "      <th>Answer.q2</th>\n",
       "      <th>Answer.q3</th>\n",
       "      <th>Answer.te</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>W196</td>\n",
       "      <td>2040</td>\n",
       "      <td>4848</td>\n",
       "      <td>('gpt2', 'p0.9')</td>\n",
       "      <td>('gpt2-xl', 'p0.95')</td>\n",
       "      <td>&lt;p&gt;&lt;strong&gt;Cost segregation benefits multifami...</td>\n",
       "      <td>&lt;p&gt;increasing the cost of property when proper...</td>\n",
       "      <td>&lt;p&gt;decreasing personal property taxes for cert...</td>\n",
       "      <td>1024</td>\n",
       "      <td>545</td>\n",
       "      <td>1a</td>\n",
       "      <td>1a</td>\n",
       "      <td>1b</td>\n",
       "      <td>28.109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>W132</td>\n",
       "      <td>1606</td>\n",
       "      <td>3352</td>\n",
       "      <td>('gpt2', 'p1.0')</td>\n",
       "      <td>('gpt2-large', 'p1.0')</td>\n",
       "      <td>&lt;p&gt;&lt;strong&gt;Endpoint for getting Luas (Dublin l...</td>\n",
       "      <td>&lt;p&gt;(Dublin light rail).&lt;/p&gt;&lt;p&gt;Added &amp;quot;subs...</td>\n",
       "      <td>&lt;p&gt;from Dublin and Dublin City using the Verti...</td>\n",
       "      <td>1022</td>\n",
       "      <td>1021</td>\n",
       "      <td>1a</td>\n",
       "      <td>1a</td>\n",
       "      <td>2a</td>\n",
       "      <td>105.285</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   HITId WorkerId  WorkTimeInSeconds  Input.idx     Input.model_b  \\\n",
       "0      0     W196               2040       4848  ('gpt2', 'p0.9')   \n",
       "1      1     W132               1606       3352  ('gpt2', 'p1.0')   \n",
       "\n",
       "            Input.model_a                                          Input.ctx  \\\n",
       "0    ('gpt2-xl', 'p0.95')  <p><strong>Cost segregation benefits multifami...   \n",
       "1  ('gpt2-large', 'p1.0')  <p><strong>Endpoint for getting Luas (Dublin l...   \n",
       "\n",
       "                                   Input.completionb  \\\n",
       "0  <p>increasing the cost of property when proper...   \n",
       "1  <p>(Dublin light rail).</p><p>Added &quot;subs...   \n",
       "\n",
       "                                   Input.completiona  Input.len_b  \\\n",
       "0  <p>decreasing personal property taxes for cert...         1024   \n",
       "1  <p>from Dublin and Dublin City using the Verti...         1022   \n",
       "\n",
       "   Input.len_a Answer.q1 Answer.q2 Answer.q3  Answer.te  \n",
       "0          545        1a        1a        1b     28.109  \n",
       "1         1021        1a        1a        2a    105.285  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the raw annotations; the first CSV column becomes the row index.\n",
    "df0 = pd.read_csv(results_fn, index_col=0)\n",
    "\n",
    "# Preview the first two rows.\n",
    "df0.head(2)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "The columns in the CSV files are: \n",
    "- `HITId`: Integer indexing the row \n",
    "- `WorkerId`: Unique identifier of the crowd-worker \n",
    "- `WorkTimeInSeconds`: Amount of time the HIT was open on AMT\n",
    "- `Input.idx`: Index of the prompt \n",
    "- `Input.ctx`: Context/prompt that each completion is based upon\n",
    "- `Input.model_a`: Name of player A\n",
    "- `Input.completiona`: Completion generated by player A\n",
    "- `Input.len_a`: Total length (prompt + generation) of player A's text\n",
    "- `Input.model_b`: Name of player B\n",
    "- `Input.completionb`: Completion generated by player B\n",
    "- `Input.len_b`: Total length (prompt + generation) of player B's text\n",
    "- `Answer.q1`:  Answer of crowd-worker to the question: \"Which continuation is more interesting or creative, given the context?\"\n",
    "- `Answer.q2`: Answer of crowd-worker to the question: \"Which continuation makes more sense, given the context?\"\n",
    "- `Answer.q3`: Answer of crowd-worker to the question: \"Which continuation is more likely to be written by a human?\" \n",
    "- `Answer.te`: Our (pessimistic) estimate of the amount of time the crowd-worker took to answer the question.\n",
    "\n",
    "\n",
    "Key to `Answer.q*` fields: The responses of the crowd-workers to each question are stored with the following key: \n",
    "- Definitely A: 2a\n",
    "- Slightly A: 1a\n",
    "- Tie: 1a \n",
    "- Slightly B: 1b\n",
    "- Definitely B: 2b\n",
    "\n",
    "Note that both \"Tie\" and \"Slightly A\" are recorded as `1a`. Since for each pair, the choice of A versus B is randomized, this amounts to randomly assigning each tie as a win to one of the two players. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"('gpt2', 'p0.9')\" \"('gpt2', 'p1.0')\" \"('gpt2-large', 'p0.95')\"\n",
      " \"('gpt2-large', 'p1.0')\" \"('gpt2-medium', 'p0.9')\"\n",
      " \"('gpt2-medium', 'p1.0')\" \"('gpt2-xl', 'p0.95')\" \"('gpt2-xl', 'p1.0')\"\n",
      " 'human']\n"
     ]
    }
   ],
   "source": [
    "# All players: every key of `mauve_scores_raw`, followed by the \"human\" player.\n",
    "player_names = np.array([*mauve_scores_raw, \"human\"])\n",
    "print(player_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_field_name(field_name):\n",
    "    \"\"\"Translate an answer-column name into its display label.\n",
    "\n",
    "    The first of 'q1', 'q2', 'q3' that occurs as a substring of `field_name`\n",
    "    selects 'Interesting', 'Sensible' or 'Human-like' respectively.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if none of the three tags occurs in `field_name`.\n",
    "    \"\"\"\n",
    "    # Ordered (tag, label) pairs; the order fixes which tag wins if several match.\n",
    "    labels = (('q1', 'Interesting'), ('q2', 'Sensible'), ('q3', 'Human-like'))\n",
    "    for tag, label in labels:\n",
    "        if tag in field_name:\n",
    "            return label\n",
    "    raise ValueError(f'Unknown name: {field_name}')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bradley-Terry Scores: Implementation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: despite its name, this maps integer index -> player name\n",
    "# (`enumerate` yields (index, name) pairs), in the order of `player_names`.\n",
    "player_name_to_idx = OrderedDict(enumerate(player_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_model1_v_model2(results, model1, model2):\n",
    "    \"\"\"Total the wins of `model1` and of `model2` over the rows that pair them.\n",
    "\n",
    "    Rows are matched in either orientation: where `model1` sits in the\n",
    "    'model1' column its wins are read from 'm1 better', and where it sits in\n",
    "    the 'model2' column they are read from 'm2 better'.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (wins of model1, wins of model2).\n",
    "    \"\"\"\n",
    "    same_order = (results['model1'] == model1) & (results['model2'] == model2)\n",
    "    swapped = (results['model1'] == model2) & (results['model2'] == model1)\n",
    "    wins_first = results.loc[same_order, 'm1 better'].sum() + results.loc[swapped, 'm2 better'].sum()\n",
    "    wins_second = results.loc[same_order, 'm2 better'].sum() + results.loc[swapped, 'm1 better'].sum()\n",
    "    return wins_first, wins_second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_head2head_and_BT_rank(field_name='Answer.q3', threshold_time=25, max_iterations=1000, tol=1e-16):\n",
    "    \"\"\"Compute Bradley-Terry scores for every player on one human-evaluation question.\n",
    "\n",
    "    Reads the notebook-level `df0` (raw annotations) and `player_names`.\n",
    "\n",
    "    Args:\n",
    "        field_name: Answer column to score, e.g. 'Answer.q3'.\n",
    "        threshold_time: Only annotations whose `Answer.te` is strictly greater\n",
    "            than this value are used.\n",
    "        max_iterations: Upper bound on the number of Zermelo iterations.\n",
    "        tol: Stop early once the L1 change of the normalized strengths between\n",
    "            two consecutive iterations drops below this value.\n",
    "\n",
    "    Returns:\n",
    "        pd.Series indexed by player name and named 'BT/' followed by the\n",
    "        question label. Each value is 100 * (log strength - mean log strength);\n",
    "        the series is sorted in descending order.\n",
    "    \"\"\"\n",
    "    # Keep only the responses that took longer than `threshold_time`.\n",
    "    df = df0[df0['Answer.te'] > threshold_time]\n",
    "    n_players = player_names.shape[0]\n",
    "\n",
    "    # head2head[i, j] = number of comparisons in which player i beat player j.\n",
    "    # Builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20, removed in 1.24).\n",
    "    head2head = np.zeros((n_players, n_players), dtype=int)\n",
    "    for i, m1 in enumerate(player_names):\n",
    "        for j, m2 in enumerate(player_names[:i]):  # each unordered pair once (j < i)\n",
    "            # A/B assignment was randomized in the human eval, so collect both orientations.\n",
    "            m1_as_a = df.loc[(df['Input.model_a'] == m1) & (df['Input.model_b'] == m2), field_name]\n",
    "            m1_as_b = df.loc[(df['Input.model_b'] == m1) & (df['Input.model_a'] == m2), field_name]\n",
    "            head2head[i, j] = m1_as_a.isin(['1a', '2a']).sum() + m1_as_b.isin(['1b', '2b']).sum()\n",
    "            head2head[j, i] = m1_as_b.isin(['1a', '2a']).sum() + m1_as_a.isin(['1b', '2b']).sum()\n",
    "    wins_per_model = head2head.sum(axis=1)  # total #wins per model\n",
    "    games = head2head + head2head.T  # games[i, j] = #comparisons between players i and j\n",
    "\n",
    "    # Deterministic uniform start so that repeated runs give identical scores.\n",
    "    ps = np.full(n_players, 1.0 / n_players)\n",
    "    qs = np.zeros_like(ps)\n",
    "\n",
    "    # Run iterations of Zermelo's algorithm. See e.g. for details:\n",
    "    # https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model\n",
    "    for _ in range(max_iterations):\n",
    "        for i in range(n_players):\n",
    "            denom = sum(games[i, j] / (ps[i] + ps[j]) for j in range(n_players) if i != j)\n",
    "            qs[i] = wins_per_model[i] / denom\n",
    "        ps_new = qs / qs.sum()\n",
    "        change = np.linalg.norm(ps_new - ps, 1)\n",
    "        ps = ps_new\n",
    "        if change < tol:\n",
    "            # Algorithm converged\n",
    "            break\n",
    "\n",
    "    # Convert the strengths into logspace and scale them as described in Appendix E.2 of\n",
    "    # the [paper](https://arxiv.org/pdf/2102.01454.pdf).\n",
    "    scores = np.log(ps)\n",
    "    scores -= scores.mean()\n",
    "    scores *= 100\n",
    "\n",
    "    # Clean up the output\n",
    "    final_name = process_field_name(field_name)\n",
    "    out = pd.Series(dict(zip(player_names, scores)), name=f'BT/{final_name}')\n",
    "    return out.sort_values(ascending=False)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Obtain the Bradley-Terry scores \n",
    "\n",
    "We discard all annotations made under `THRESHOLD_TIME=25` seconds for quality control using our pessimistic estimates obtained from the field `Answer.te`. \n",
    "See p. 29 of the paper under the heading \"Quality Control\" for details. \n",
    "\n",
    "The correlations are reported in Table 5 (and Table 14) of the paper. \n",
    "The raw Bradley-Terry scores are reported in Table 13. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimum `Answer.te` value (exclusive) an annotation needs in order to be kept.\n",
    "THRESHOLD_TIME = 25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation = SpearmanrResult(correlation=-0.523809523809524, pvalue=0.18272075053971484)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:25: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BT/Human-like</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>human</th>\n",
       "      <td>47.251038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p0.95')</th>\n",
       "      <td>15.663819</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p0.95')</th>\n",
       "      <td>12.552649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p1.0')</th>\n",
       "      <td>8.966299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p0.9')</th>\n",
       "      <td>-3.429421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p1.0')</th>\n",
       "      <td>-6.934852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p0.9')</th>\n",
       "      <td>-15.783290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p1.0')</th>\n",
       "      <td>-27.517603</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p1.0')</th>\n",
       "      <td>-30.768640</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         BT/Human-like\n",
       "human                        47.251038\n",
       "('gpt2-xl', 'p0.95')         15.663819\n",
       "('gpt2-large', 'p0.95')      12.552649\n",
       "('gpt2-xl', 'p1.0')           8.966299\n",
       "('gpt2-medium', 'p0.9')      -3.429421\n",
       "('gpt2-large', 'p1.0')       -6.934852\n",
       "('gpt2', 'p0.9')            -15.783290\n",
       "('gpt2', 'p1.0')            -27.517603\n",
       "('gpt2-medium', 'p1.0')     -30.768640"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Question q3: \"Which continuation is more likely to be written by a human?\"\n",
    "h3 = get_head2head_and_BT_rank(\n",
    "    field_name='Answer.q3',\n",
    "    threshold_time=THRESHOLD_TIME,\n",
    ")\n",
    "\n",
    "# Rank correlation between the BT scores of the models (human excluded) and MAUVE;\n",
    "# both series are sorted by label so that their entries line up.\n",
    "correlation = scipy.stats.spearmanr(\n",
    "    h3.drop(\"human\").sort_index(),\n",
    "    mauve_scores.sort_index(),\n",
    ")\n",
    "print(\"Correlation =\", correlation)\n",
    "h3.to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation = SpearmanrResult(correlation=-0.4285714285714286, pvalue=0.2894032248467901)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:25: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BT/Sensible</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>human</th>\n",
       "      <td>43.229275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p0.95')</th>\n",
       "      <td>31.887734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p0.95')</th>\n",
       "      <td>8.781433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p1.0')</th>\n",
       "      <td>7.752505</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p1.0')</th>\n",
       "      <td>-7.106110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p0.9')</th>\n",
       "      <td>-7.293270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p0.9')</th>\n",
       "      <td>-7.441769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p1.0')</th>\n",
       "      <td>-32.004313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p1.0')</th>\n",
       "      <td>-37.805484</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         BT/Sensible\n",
       "human                      43.229275\n",
       "('gpt2-xl', 'p0.95')       31.887734\n",
       "('gpt2-large', 'p0.95')     8.781433\n",
       "('gpt2-xl', 'p1.0')         7.752505\n",
       "('gpt2-large', 'p1.0')     -7.106110\n",
       "('gpt2-medium', 'p0.9')    -7.293270\n",
       "('gpt2', 'p0.9')           -7.441769\n",
       "('gpt2-medium', 'p1.0')   -32.004313\n",
       "('gpt2', 'p1.0')          -37.805484"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Question q2: \"Which continuation makes more sense, given the context?\"\n",
    "h2 = get_head2head_and_BT_rank(\n",
    "    field_name='Answer.q2',\n",
    "    threshold_time=THRESHOLD_TIME,\n",
    ")\n",
    "\n",
    "# Rank correlation between the BT scores of the models (human excluded) and MAUVE;\n",
    "# both series are sorted by label so that their entries line up.\n",
    "correlation = scipy.stats.spearmanr(\n",
    "    h2.drop(\"human\").sort_index(),\n",
    "    mauve_scores.sort_index(),\n",
    ")\n",
    "print(\"Correlation =\", correlation)\n",
    "h2.to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation = SpearmanrResult(correlation=-0.4285714285714286, pvalue=0.2894032248467901)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:25: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "/Users/james/opt/anaconda3/envs/graphdl/lib/python3.7/site-packages/ipykernel_launcher.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BT/Interesting</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>human</th>\n",
       "      <td>25.503156</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p0.95')</th>\n",
       "      <td>23.045606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-xl', 'p1.0')</th>\n",
       "      <td>9.529022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p0.95')</th>\n",
       "      <td>6.785066</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p0.9')</th>\n",
       "      <td>-0.696617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-large', 'p1.0')</th>\n",
       "      <td>-1.532425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p0.9')</th>\n",
       "      <td>-12.823619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2', 'p1.0')</th>\n",
       "      <td>-15.487289</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>('gpt2-medium', 'p1.0')</th>\n",
       "      <td>-34.322899</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         BT/Interesting\n",
       "human                         25.503156\n",
       "('gpt2-xl', 'p0.95')          23.045606\n",
       "('gpt2-xl', 'p1.0')            9.529022\n",
       "('gpt2-large', 'p0.95')        6.785066\n",
       "('gpt2', 'p0.9')              -0.696617\n",
       "('gpt2-large', 'p1.0')        -1.532425\n",
       "('gpt2-medium', 'p0.9')      -12.823619\n",
       "('gpt2', 'p1.0')             -15.487289\n",
       "('gpt2-medium', 'p1.0')      -34.322899"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Question q1: \"Which continuation is more interesting or creative, given the context?\"\n",
    "h1 = get_head2head_and_BT_rank(\n",
    "    field_name='Answer.q1',\n",
    "    threshold_time=THRESHOLD_TIME,\n",
    ")\n",
    "\n",
    "# Rank correlation between the BT scores of the models (human excluded) and MAUVE;\n",
    "# both series are sorted by label so that their entries line up.\n",
    "correlation = scipy.stats.spearmanr(\n",
    "    h1.drop(\"human\").sort_index(),\n",
    "    mauve_scores.sort_index(),\n",
    ")\n",
    "print(\"Correlation =\", correlation)\n",
    "h1.to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
