{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'101'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode a single token id (the saved output for this cell is '101').\n",
    "# NOTE(review): `T` is only assigned in a later cell (`T = TOKENIZER`), so this cell\n",
    "# fails on a fresh kernel unless that setup cell has been run first.\n",
    "T.decode([13736])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ask the tokenizer for its unknown-token id (the saved output for this cell is 2).\n",
    "T.unk_id()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁'"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the piece string for token id 3 (the saved output for this cell is '▁').\n",
    "T.id_to_piece(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'110'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode token id 3 followed by 11190 (the saved output for this cell is '110').\n",
    "T.decode([3,11190])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting jsonlines\n",
      "  Using cached jsonlines-3.0.0-py3-none-any.whl (8.5 kB)\n",
      "Requirement already satisfied: attrs>=19.2.0 in /Users/felix/Documents/research/envs/real_t5x/lib/python3.8/site-packages (from jsonlines) (21.4.0)\n",
      "Installing collected packages: jsonlines\n",
      "Successfully installed jsonlines-3.0.0\n"
     ]
    }
   ],
   "source": [
    "# Use %pip (not !pip) so the package is installed into the running kernel's environment.\n",
    "# Pinned to the version recorded in this cell's saved output for reproducibility.\n",
    "%pip install jsonlines==3.0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/xb/xhwwptys1xs5vn0fn_7c7_600000gn/T/ipykernel_9537/3995065402.py:4: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
      "  from IPython.core.display import display, HTML\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style>.container { width:100% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import json\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "# Importing these names from `IPython.core.display` is deprecated since IPython 7.14\n",
    "# (see the DeprecationWarning in this cell's saved stderr); use `IPython.display`.\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "# Stretch the notebook's `.container` element to the full page width.\n",
    "display(HTML(\"<style>.container { width:100% !important; }</style>\"))\n",
    "\n",
    "# Star import kept last, as before. This notebook relies on it for names that no\n",
    "# visible cell defines: T5_TOKEN_IDS_FOR_TASKS, TOKENIZER, TASK_REGISTRY, generate_example.\n",
    "from tasks_with_tokens import *\n",
    "\n",
    "\n",
    "# Tuple copy of the task token ids; used as the default `token_ids` and sliced for sampling.\n",
    "tids = tuple(T5_TOKEN_IDS_FOR_TASKS)\n",
    "def show_sample_examples(func, num_examples=10, length_range=(10, 20), token_ids=tids):\n",
    "    \"\"\"Print `num_examples` samples produced by calling `func`.\n",
    "\n",
    "    Args:\n",
    "        func: callable invoked as `func(length_range=..., token_ids=...)`.\n",
    "        num_examples: number of samples to print.\n",
    "        length_range: forwarded to `func` unchanged.\n",
    "        token_ids: forwarded to `func`; defaults to the notebook-level `tids`.\n",
    "    \"\"\"\n",
    "    for _ in range(num_examples):\n",
    "        # Previously this passed `char_set`, which no visible cell defines, and ignored\n",
    "        # the `token_ids` parameter entirely.\n",
    "        # NOTE(review): assumes `func` accepts `token_ids`, matching the sampling args\n",
    "        # used in show_task_examples - confirm against tasks_with_tokens.\n",
    "        print(func(length_range=length_range, token_ids=token_ids))\n",
    "        \n",
    "# Task names demonstrated below; each is used as a key into TASK_REGISTRY.\n",
    "UNARY_BASIC_TASKS = [\n",
    "    'copy',\n",
    "    'reverse',\n",
    "    'set',\n",
    "    'first_char',\n",
    "    'last_char',\n",
    "    'deduplicate',\n",
    "    'length',\n",
    "    'longest_word',\n",
    "    'duplicate',\n",
    "]\n",
    "# To debug a single task, temporarily use: UNARY_BASIC_TASKS = ['length']\n",
    "\n",
    "# Short alias for the tokenizer used by the decode calls in this notebook.\n",
    "T = TOKENIZER\n",
    "\n",
    "def show_task_examples(task, is_natural_language=False, num_examples=20, is_print=True):\n",
    "    \"\"\"Generate `num_examples` examples for one registered task and optionally print them.\n",
    "\n",
    "    Every example is produced by `generate_example(TASK_REGISTRY[task], ...)` with\n",
    "    `length_range=(10, 30)` and `token_ids=tids[:200]`.\n",
    "\n",
    "    Args:\n",
    "        task: key into `TASK_REGISTRY` (e.g. an entry of `UNARY_BASIC_TASKS`).\n",
    "        is_natural_language: forwarded unchanged to `generate_example`.\n",
    "        num_examples: how many examples to generate.\n",
    "        is_print: when True, print each example plus its decoded input and output.\n",
    "\n",
    "    Raises:\n",
    "        ValueError / SyntaxError: from `ast.literal_eval` if an example string is not\n",
    "            a plain Python literal (only reachable when `is_print` is True).\n",
    "    \"\"\"\n",
    "    import ast  # local import keeps this definition self-contained\n",
    "\n",
    "    # The sampling arguments are identical for every iteration, so build them once.\n",
    "    basic_task_sampling_args = {'length_range': (10, 30), 'token_ids': tids[:200]}\n",
    "    for count in range(num_examples):\n",
    "        example = generate_example(TASK_REGISTRY[task], is_natural_language=is_natural_language, **basic_task_sampling_args)\n",
    "        # Progress marker every 1000 examples (prints 0 on the first iteration).\n",
    "        if count % 1000 == 0:\n",
    "            print(count)\n",
    "        if not is_print:\n",
    "            continue\n",
    "\n",
    "        print(\"-----\")\n",
    "        if task == 'longest_word':\n",
    "            # Show input and output on separate labelled lines for this task.\n",
    "            print(f'@input = {example[0]}')\n",
    "            print(f'output = {example[1]}')\n",
    "        else:\n",
    "            print(example)\n",
    "\n",
    "        # The saved outputs show both fields as strings holding a list literal of token ids.\n",
    "        # literal_eval parses such literals without executing arbitrary code, unlike eval.\n",
    "        print(f\"input decoded: {T.decode(ast.literal_eval(example[0]))}\")\n",
    "\n",
    "        print(f\"output decoded: {T.decode(ast.literal_eval(example[1]))}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "copy\n",
      "0\n",
      "-----\n",
      "('[2405, 32099, 129, 165, 148, 13, 132, 67, 2, 191, 106, 120, 73, 94, 102, 174, 50, 90, 183, 143, 57, 105, 134, 80, 3, 96, 36, 19, 173, 161, 191, 188]', '[129, 165, 148, 13, 132, 67, 2, 191, 106, 120, 73, 94, 102, 174, 50, 90, 183, 143, 57, 105, 134, 80, 3, 96, 36, 19, 173, 161, 191, 188]')\n",
      "input decoded: copy <extra_id_0> get its You of there die ⁇  pentruonly un Itp need la le am make by “S one  \" be isil work pentruA\n",
      "output decoded: get its You of there die ⁇  pentruonly un Itp need la le am make by “S one  \" be isil work pentruA\n",
      "-----\n",
      "('[2405, 32099, 62, 103, 94, 139, 178, 120, 162, 53, 69, 140, 43, 168, 11, 12, 51, 26, 82, 8, 55, 73, 39, 142, 42]', '[62, 103, 94, 139, 178, 120, 162, 53, 69, 140, 43, 168, 11, 12, 51, 26, 82, 8, 55, 73, 39, 142, 42]')\n",
      "input decoded: copy <extra_id_0> we do It into uslyveing our me have well and tomd my the! un your se or\n",
      "output decoded: we do It into uslyveing our me have well and tomd my the! un your se or\n",
      "-----\n",
      "('[2405, 32099, 194, 92, 158, 89, 24, 141, 110, 184, 77, 118, 85, 166, 173]', '[194, 92, 158, 89, 24, 141, 110, 184, 77, 118, 85, 166, 173]')\n",
      "input decoded: copy <extra_id_0> way also pef that had les&in beenà firstil\n",
      "output decoded: way also pef that had les&in beenà firstil\n",
      "-----\n",
      "('[2405, 32099, 13, 57, 170, 59, 105, 99, 136, 13, 142, 28, 135, 69, 3, 167, 130, 41, 49, 65, 7, 134, 54, 89, 45]', '[13, 57, 170, 59, 105, 99, 136, 13, 142, 28, 135, 69, 3, 167, 130, 41, 49, 65, 7, 134, 54, 89, 45]')\n",
      "input decoded: copy <extra_id_0> of by zu not “if any of se with them our  most were (er hassS canf from\n",
      "output decoded: of by zu not “if any of se with them our  most were (er hassS canf from\n",
      "-----\n",
      "('[2405, 32099, 146, 172, 106, 61, 139, 94, 198, 166, 109, 10]', '[146, 172, 106, 61, 139, 94, 198, 166, 109, 10]')\n",
      "input decoded: copy <extra_id_0> duzon) into It și firstle:\n",
      "output decoded: duzon) into It și firstle:\n",
      "\n",
      "reverse\n",
      "0\n",
      "-----\n",
      "('[7211, 32099, 24, 199, 20, 138, 61, 23, 48, 67, 84, 63, 26, 149, 74, 19]', '[19, 74, 149, 26, 63, 84, 67, 48, 23, 61, 138, 20, 199, 24]')\n",
      "input decoded: reverse <extra_id_0> that help deal)i this die whichyd how der is\n",
      "output decoded: is der howdy which die thisi)al de help that\n",
      "-----\n",
      "('[7211, 32099, 22, 36, 177, 59, 154, 175, 149, 167, 150, 76, 23, 146, 117, 9, 78, 83]', '[83, 78, 9, 117, 146, 23, 76, 150, 167, 149, 175, 154, 59, 177, 36, 22]')\n",
      "input decoded: reverse <extra_id_0>’ be den noté these how most noui du;a soul\n",
      "output decoded: ul soa; duiu no most how theseé not den be’\n",
      "-----\n",
      "('[7211, 32099, 178, 140, 28, 124, 189, 9, 11, 12, 21, 87, 85, 14, 200, 105, 147, 164, 140, 163, 42, 98, 30, 80, 87, 145]', '[145, 87, 80, 30, 98, 42, 163, 140, 164, 147, 105, 200, 14, 85, 87, 21, 12, 11, 9, 189, 124, 28, 140, 178]')\n",
      "input decoded: reverse <extra_id_0> us me with caretha and to for/à fill best “ over may me only oră on one/ than\n",
      "output decoded: than/ one onă or only me may over “ best fillà/ for to andath care with me us\n",
      "-----\n",
      "('[7211, 32099, 200, 117, 142, 14, 117, 4, 49, 107, 25, 181, 50, 101, 146, 36, 8, 126, 59, 27, 117, 131, 80, 167, 26, 97, 57, 36]', '[36, 57, 97, 26, 167, 80, 131, 117, 27, 59, 126, 8, 36, 146, 101, 50, 181, 25, 107, 49, 4, 117, 14, 142, 117, 200]')\n",
      "input decoded: reverse <extra_id_0> best; se fill;Xerh you mit la We du be the new not I; just one mostd time by be\n",
      "output decoded: be by timed most one just; I not new the be du We la mit youherX; fill se; best\n",
      "-----\n",
      "('[7211, 32099, 200, 139, 71, 11, 150, 119, 9, 71, 102, 107, 45, 135, 85, 29, 68, 6, 173, 77]', '[77, 173, 6, 68, 29, 85, 135, 45, 107, 102, 71, 9, 119, 150, 11, 71, 139, 200]')\n",
      "input decoded: reverse <extra_id_0> best into A and no othera Aph from themàn but,ilin\n",
      "output decoded: inil, butnà them fromhp Aa other no and A into best\n",
      "\n",
      "set\n",
      "0\n",
      "-----\n",
      "('[356, 32099, 110, 107, 110, 107, 37, 37, 107, 107, 37, 37, 110, 110, 107, 110]', '[110, 107, 37]')\n",
      "input decoded: set <extra_id_0> lesh lesh The Thehh The The les lesh les\n",
      "output decoded: lesh The\n",
      "-----\n",
      "('[356, 32099, 47, 20, 200, 20, 94, 93, 5, 47, 93, 101, 51, 22, 20, 161, 20, 20, 51]', '[47, 20, 200, 94, 93, 5, 101, 51, 22, 161]')\n",
      "input decoded: set <extra_id_0> was de best de It des. was des Wem’ de work de dem\n",
      "output decoded: was de best It des. Wem’ work\n",
      "-----\n",
      "('[356, 32099, 169, 169, 127, 169, 23, 146, 49, 23, 127, 178, 76, 76, 120, 32, 32, 120, 76, 169, 170]', '[169, 127, 23, 146, 49, 178, 76, 120, 32, 170]')\n",
      "input decoded: set <extra_id_0> use useor usei duerior usuulyoolyu use zu\n",
      "output decoded: useori duer usulyo zu\n",
      "-----\n",
      "('[356, 32099, 121, 80, 45, 80, 80, 45, 121, 121, 45, 80, 45, 80, 45, 121, 121, 80, 80, 80, 121, 45, 80]', '[121, 80, 45]')\n",
      "input decoded: set <extra_id_0>\" one from one one from\"\" from one from one from\"\" one one one\" from one\n",
      "output decoded: \" one from\n",
      "-----\n",
      "('[356, 32099, 148, 122, 148, 192, 148, 134, 148, 122, 134, 148, 134, 161, 122, 161]', '[148, 122, 192, 134, 161]')\n",
      "input decoded: set <extra_id_0> Youg You two YouS YougS YouS workg work\n",
      "output decoded: Youg twoS work\n",
      "\n",
      "first_char\n",
      "0\n",
      "-----\n",
      "('[166, 834, 4059, 32099, 50, 88, 19, 89, 109, 12, 39, 29, 172, 153, 129, 150, 2, 107, 60, 97, 100, 16, 180, 25, 161, 45, 4, 166, 22, 194, 53]', '[50]')\n",
      "input decoded: first_char <extra_id_0> lahe isfle to yournz” get no ⁇ hre time This in S you work fromX first’ waying\n",
      "output decoded: la\n",
      "-----\n",
      "('[166, 834, 4059, 32099, 199, 102, 15, 157, 85, 139, 76, 101, 50, 110, 129, 123, 22, 17, 46, 75, 175, 106, 98, 54, 145, 125, 152]', '[199]')\n",
      "input decoded: first_char <extra_id_0> helppekà intou We la les get cu’t anc theseonă can than whatan\n",
      "output decoded: help\n",
      "-----\n",
      "('[166, 834, 4059, 32099, 191, 22, 114, 86, 117, 21, 130, 154, 47, 125, 85, 143, 97, 135, 165, 65, 72, 112, 21, 11, 156, 125, 103, 123]', '[191]')\n",
      "input decoded: first_char <extra_id_0> pentru’ like In; for wereé was whatà make time them its has more his for and If what do cu\n",
      "output decoded: pentru\n",
      "-----\n",
      "('[166, 834, 4059, 32099, 31, 158, 75, 181, 16, 37, 158, 90, 16, 24, 56, 135, 97, 30, 67, 103, 147, 49, 108]', '[31]')\n",
      "input decoded: first_char <extra_id_0>' pec mit in The pe le in that will them time on die do overer si\n",
      "output decoded: '\n",
      "-----\n",
      "('[166, 834, 4059, 32099, 95, 138, 84, 82, 33, 40, 17, 113, 46, 23, 105, 172, 54, 86, 72, 69, 106, 87, 166, 50, 188, 128, 98, 124, 111, 68, 21, 172, 190, 163]', '[95]')\n",
      "input decoded: first_char <extra_id_0> upal which my arelt who ani “z can In more ouron/ first laA someă care în but forz through only\n",
      "output decoded: up\n",
      "\n",
      "last_char\n",
      "0\n",
      "-----\n",
      "('[336, 834, 4059, 32099, 13, 49, 95, 72, 92, 58, 97, 125, 103, 12, 82, 124, 119, 95, 66, 135, 75, 33, 51]', '[51]')\n",
      "input decoded: last_char <extra_id_0> ofer up more also? time what do to my care other up all themc arem\n",
      "output decoded: m\n",
      "-----\n",
      "('[336, 834, 4059, 32099, 28, 85, 157, 80, 183, 97, 12, 17, 13, 144, 44, 188, 150, 173, 71, 29, 131, 118, 27, 161, 29, 160, 44, 98, 127, 46, 21]', '[21]')\n",
      "input decoded: last_char <extra_id_0> withàk one am time tot ofat atA noil An just been I workn her atăor an for\n",
      "output decoded: for\n",
      "-----\n",
      "('[336, 834, 4059, 32099, 44, 139, 124, 50, 116, 28, 54, 182, 103, 191, 122, 182, 42, 122, 103, 196, 59, 130, 81, 83, 125, 181, 60, 165, 103, 161, 171]', '[171]')\n",
      "input decoded: last_char <extra_id_0> at into care la when with can very do pentrug very org doI not were aboutul what mitre its do work pour\n",
      "output decoded: pour\n",
      "-----\n",
      "('[336, 834, 4059, 32099, 200, 173, 70, 71, 8, 198, 97, 144, 7, 185, 116, 58, 8, 72, 184, 15, 200, 167, 176, 162, 71, 119, 92, 95, 140, 192, 39, 18]', '[18]')\n",
      "input decoded: last_char <extra_id_0> bestil their A the și timeats au when? the more&e best most dinve A other also up me two your-\n",
      "output decoded: -\n",
      "-----\n",
      "('[336, 834, 4059, 32099, 8, 181, 190, 134, 98, 58, 42, 140, 81, 127, 55, 177, 20, 105, 45, 185, 197, 19, 184, 187, 75, 125, 89, 167]', '[167]')\n",
      "input decoded: last_char <extra_id_0> the mit throughSă? or me aboutor! den de “ from au ce is& maic whatf most\n",
      "output decoded: most\n",
      "\n",
      "deduplicate\n",
      "0\n",
      "-----\n",
      "('[20, 26, 413, 26221, 32099, 75, 59, 193, 193, 178, 105, 117, 135, 196, 101]', '[75, 59, 193, 178, 105, 117, 135, 196, 101]')\n",
      "input decoded: deduplicate <extra_id_0>c not von von us “; themI We\n",
      "output decoded: c not von us “; themI We\n",
      "-----\n",
      "('[20, 26, 413, 26221, 32099, 27, 68, 86, 86, 86, 86, 130, 75, 75, 154, 164, 164, 166, 86, 176, 176, 176, 180, 155, 117, 117]', '[27, 68, 86, 130, 75, 154, 164, 166, 86, 176, 180, 155, 117]')\n",
      "input decoded: deduplicate <extra_id_0> I but In In In In wereccé may may first In din din din Sit;;\n",
      "output decoded: I but In werecé may first In din Sit;\n",
      "-----\n",
      "('[20, 26, 413, 26221, 32099, 50, 197, 60, 194, 102, 102, 98, 188, 158, 142, 56, 120, 100, 15, 61, 102, 13, 16, 26, 40, 49, 29, 46, 108, 85, 85, 98, 80, 115, 71]', '[50, 197, 60, 194, 102, 98, 188, 158, 142, 56, 120, 100, 15, 61, 102, 13, 16, 26, 40, 49, 29, 46, 108, 85, 98, 80, 115, 71]')\n",
      "input decoded: deduplicate <extra_id_0> la cere wayppăA pe se willly Thise)p of indlern an siààă oneb A\n",
      "output decoded: la cere waypăA pe se willly Thise)p of indlern an siàă oneb A\n",
      "-----\n",
      "('[20, 26, 413, 26221, 32099, 61, 61, 61, 61, 61, 61, 61, 61, 61, 95, 95, 95, 95, 95, 95, 95, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34]', '[61, 95, 34]')\n",
      "input decoded: deduplicate <extra_id_0>))))))))) up up up up up up up it it it it it it it it it it\n",
      "output decoded: ) up it\n",
      "-----\n",
      "('[20, 26, 413, 26221, 32099, 143, 143, 74, 74, 43, 124, 126, 111, 111, 111, 122, 61, 61, 64, 163, 181, 44, 65, 65, 50, 19, 19, 19, 136, 136, 155]', '[143, 74, 43, 124, 126, 111, 122, 61, 64, 163, 181, 44, 65, 50, 19, 136, 155]')\n",
      "input decoded: deduplicate <extra_id_0> make make der der have care new în în îng)) und only mit at has has la is is is any anyit\n",
      "output decoded: make der have care new îng) und only mit at has la is anyit\n",
      "\n",
      "length\n",
      "0\n",
      "-----\n",
      "('[2475, 32099, 31, 5, 155, 6, 189, 111, 171, 33, 96, 172, 144, 163, 181, 93, 190, 93]', '[898]')\n",
      "input decoded: length <extra_id_0>'.it,th în pour are \"zat only mit des through des\n",
      "output decoded: 16\n",
      "-----\n",
      "('[2475, 32099, 30, 76, 110, 165, 130, 137, 31, 82, 74, 199, 62, 182, 55, 167, 69, 91, 73]', '[1003]')\n",
      "input decoded: length <extra_id_0> onu les its were).' my der help we very! most our out un\n",
      "output decoded: 17\n",
      "-----\n",
      "('[2475, 32099, 21, 93, 89, 161, 128, 38, 7, 88, 29, 149, 107, 58, 69, 42, 52, 17]', '[898]')\n",
      "input decoded: length <extra_id_0> for desf work some asshen howh? our orrt\n",
      "output decoded: 16\n",
      "-----\n",
      "('[2475, 32099, 165, 189, 35, 167, 195, 14, 55, 163, 116, 184, 164, 129, 95, 160, 72, 63, 133, 34, 69, 151, 7, 120, 132, 146, 24, 54, 8, 64, 112]', '[2838]')\n",
      "input decoded: length <extra_id_0> itsthen mostll fill! only when& may get up her morey would it our peoplesly there du that can the und his\n",
      "output decoded: 29\n",
      "-----\n",
      "('[2475, 32099, 27, 187, 90, 175, 64, 46, 26, 183, 143, 167, 4, 150, 72, 164, 46, 157, 20]', '[1003]')\n",
      "input decoded: length <extra_id_0> I mai le these und and am make mostX no more may ank de\n",
      "output decoded: 17\n",
      "\n",
      "longest_word\n",
      "0\n",
      "-----\n",
      "@input = [14783, 834, 6051, 32099, 22, 165, 128, 32098, 3, 148, 157, 32098, 187, 123, 32098, 90, 141, 32098, 120, 27, 18, 104, 32098, 53, 142, 32098, 63, 143, 42, 32098, 38, 32098, 8, 29]\n",
      "output = [120, 27, 18, 104]\n",
      "input decoded: longest_word <extra_id_0>’ its some <extra_id_1>  Youk <extra_id_1> mai cu <extra_id_1> le had <extra_id_1>ly I-– <extra_id_1>ing se <extra_id_1>y make or <extra_id_1> as <extra_id_1> then\n",
      "output decoded: ly I-–\n",
      "-----\n",
      "@input = [14783, 834, 6051, 32099, 108, 123, 60, 77, 32098, 13, 167, 32098, 23, 189, 114, 28, 32098, 57, 192, 32098, 121, 53, 82, 129, 32098, 71, 68, 148, 32098, 185, 197]\n",
      "output = [108, 123, 60, 77]\n",
      "input decoded: longest_word <extra_id_0> si curein <extra_id_1> of most <extra_id_1>ith like with <extra_id_1> by two <extra_id_1>\"ing my get <extra_id_1> A but You <extra_id_1> au ce\n",
      "output decoded: si curein\n",
      "-----\n",
      "@input = [14783, 834, 6051, 32099, 79, 19, 100, 80, 156, 143, 64, 149, 120, 85, 123, 121, 12, 189, 141, 130, 91, 18, 133, 148, 123, 193, 60, 45, 121]\n",
      "output = [79, 19, 100, 80, 156, 143, 64, 149, 120, 85, 123, 121, 12, 189, 141, 130, 91, 18, 133, 148, 123, 193, 60, 45, 121]\n",
      "input decoded: longest_word <extra_id_0> they is This one If make und howlyà cu\" toth had were out- would You cu vonre from\"\n",
      "output decoded: they is This one If make und howlyà cu\" toth had were out- would You cu vonre from\"\n",
      "-----\n",
      "@input = [14783, 834, 6051, 32099, 137, 163, 32098, 97, 32098, 154, 71, 32098, 69, 68, 32098, 146, 41, 32098, 115, 200, 32098, 143, 105, 120, 32098, 54, 134, 32098, 75]\n",
      "output = [143, 105, 120]\n",
      "input decoded: longest_word <extra_id_0>). only <extra_id_1> time <extra_id_1>é A <extra_id_1> our but <extra_id_1> du ( <extra_id_1>b best <extra_id_1> make “ly <extra_id_1> canS <extra_id_1>c\n",
      "output decoded: make “ly\n",
      "-----\n",
      "@input = [14783, 834, 6051, 32099, 102, 32098, 71, 32098, 176, 32098, 144, 32098, 164, 23, 34, 32098, 122, 32098, 59]\n",
      "output = [164, 23, 34]\n",
      "input decoded: longest_word <extra_id_0>p <extra_id_1> A <extra_id_1> din <extra_id_1>at <extra_id_1> mayi it <extra_id_1>g <extra_id_1> not\n",
      "output decoded: mayi it\n",
      "\n",
      "duplicate\n",
      "0\n",
      "-----\n",
      "('[19197, 32099, 17, 199, 103, 154, 190, 55, 197, 9, 65, 3, 159, 34, 47, 126, 88, 146, 142, 178, 56, 201]', '[17, 17, 199, 199, 103, 103, 154, 154, 190, 190, 55, 55, 197, 197, 9, 9, 65, 65, 3, 3, 159, 159, 34, 34, 47, 47, 126, 126, 88, 88, 146, 146, 142, 142, 178, 178, 56, 56, 201, 201]')\n",
      "input decoded: duplicate <extra_id_0>t help doé through! cea has is it was newhe du se us will),\n",
      "output decoded: tt help help do doéé through through!! ce ceaa has has  isis it it was was new newhehe du du se se us us will will),),\n",
      "-----\n",
      "('[19197, 32099, 163, 17, 96, 201, 93, 152, 131, 68, 16, 83, 188, 49, 159, 87, 38, 170]', '[163, 163, 17, 17, 96, 96, 201, 201, 93, 93, 152, 152, 131, 131, 68, 68, 16, 16, 83, 83, 188, 188, 49, 49, 159, 159, 87, 87, 38, 38, 170, 170]')\n",
      "input decoded: duplicate <extra_id_0> onlyt \"), desan just but inulAeris/ as zu\n",
      "output decoded: only onlytt \" \"),), des desanan just just but but in inululAAererisis// as as zu zu\n",
      "-----\n",
      "('[19197, 32099, 97, 198, 63, 152, 157, 79, 101, 70, 188, 114, 146, 28, 32]', '[97, 97, 198, 198, 63, 63, 152, 152, 157, 157, 79, 79, 101, 101, 70, 70, 188, 188, 114, 114, 146, 146, 28, 28, 32, 32]')\n",
      "input decoded: duplicate <extra_id_0> time șiyank they We theirA like du witho\n",
      "output decoded: time time și șiyyanankk they they We We their theirAA like like du du with withoo\n",
      "-----\n",
      "('[19197, 32099, 92, 189, 83, 84, 201, 94, 146, 123, 30, 104]', '[92, 92, 189, 189, 83, 83, 84, 84, 201, 201, 94, 94, 146, 146, 123, 123, 30, 30, 104, 104]')\n",
      "input decoded: duplicate <extra_id_0> alsothul which), It du cu on–\n",
      "output decoded: also alsoththulul which which),), It It du du cu cu on on––\n",
      "-----\n",
      "('[19197, 32099, 166, 146, 101, 56, 159, 111, 179, 150, 21, 56, 178, 114, 110, 40, 6]', '[166, 166, 146, 146, 101, 101, 56, 56, 159, 159, 111, 111, 179, 179, 150, 150, 21, 21, 56, 56, 178, 178, 114, 114, 110, 110, 40, 40, 6, 6]')\n",
      "input decoded: duplicate <extra_id_0> first du We willis înable no for will us like lesl,\n",
      "output decoded: first first du du We We will willisis în înableable no no for for will will us us like like les lesll,,\n"
     ]
    }
   ],
   "source": [
    "# For each task in UNARY_BASIC_TASKS, print the task name and then 5 generated examples.\n",
    "for task in UNARY_BASIC_TASKS:\n",
    "    print(f'\\n{task}')\n",
    "    show_task_examples(task, num_examples=5, is_print=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "count\n",
      "0\n",
      "-----\n",
      "('[3476, 32099, 21, 66, 72, 72, 126, 4, 72, 72, 72, 32097, 72]', '[305]')\n",
      "input decoded: count <extra_id_0> for all more more newX more more more <extra_id_2> more\n",
      "output decoded: 5\n",
      "-----\n",
      "('[3476, 32099, 199, 145, 145, 145, 145, 145, 145, 145, 186, 145, 99, 3, 145, 145, 145, 156, 145, 145, 145, 32097, 145]', '[968]')\n",
      "input decoded: count <extra_id_0> help than than than than than than than many thanif  than than than If than than than <extra_id_2> than\n",
      "output decoded: 14\n",
      "-----\n",
      "('[3476, 32099, 3, 191, 191, 191, 19, 183, 77, 54, 179, 195, 174, 191, 54, 191, 191, 191, 180, 191, 116, 191, 164, 191, 133, 191, 147, 32097, 191]', '[850]')\n",
      "input decoded: count <extra_id_0>  pentru pentru pentru is amin canablell need pentru can pentru pentru pentru S pentru when pentru may pentru would pentru over <extra_id_2> pentru\n",
      "output decoded: 11\n",
      "-----\n",
      "('[3476, 32099, 47, 47, 47, 47, 47, 47, 47, 47, 145, 47, 47, 32097, 47]', '[335]')\n",
      "input decoded: count <extra_id_0> was was was was was was was was than was was <extra_id_2> was\n",
      "output decoded: 10\n",
      "-----\n",
      "('[3476, 32099, 167, 167, 7, 42, 167, 167, 121, 167, 167, 113, 167, 167, 138, 167, 167, 9, 167, 167, 167, 167, 32097, 167]', '[968]')\n",
      "input decoded: count <extra_id_0> most mosts or most most\" most most who most mostal most mosta most most most most <extra_id_2> most\n",
      "output decoded: 14\n",
      "\n",
      "delete\n",
      "0\n",
      "-----\n",
      "('[9268, 32099, 147, 197, 112, 126, 57, 5, 130, 109, 107, 46, 181, 10, 100, 35, 2, 140, 12, 141, 32097, 57, 5, 130, 109, 107, 46, 181, 10, 100]', '[147, 197, 112, 126, 35, 2, 140, 12, 141]')\n",
      "input decoded: delete <extra_id_0> over ce his new by. wereleh an mit: Thisen ⁇  me to had <extra_id_2> by. wereleh an mit: This\n",
      "output decoded: over ce his newen ⁇  me to had\n",
      "-----\n",
      "('[9268, 32099, 30, 69, 31, 176, 92, 17, 138, 165, 123, 5, 86, 145, 104, 46, 192, 76, 4, 54, 182, 32097, 138, 165, 123, 5, 86, 145, 104, 46, 192]', '[30, 69, 31, 176, 92, 17, 76, 4, 54, 182]')\n",
      "input decoded: delete <extra_id_0> on our' din alsotal its cu. In than– an twouX can very <extra_id_2>al its cu. In than– an two\n",
      "output decoded: on our' din alsotuX can very\n",
      "-----\n",
      "('[9268, 32099, 144, 14, 120, 108, 155, 134, 93, 21, 187, 115, 108, 155, 134, 93, 21, 103, 173, 32097, 108, 155, 134, 93, 21]', '[144, 14, 120, 187, 115, 108, 155, 134, 93, 21, 103, 173]')\n",
      "input decoded: delete <extra_id_0>at fillly siitS des for maib siitS des for doil <extra_id_2> siitS des for\n",
      "output decoded: at fillly maib siitS des for doil\n",
      "-----\n",
      "('[9268, 32099, 77, 118, 5, 170, 165, 155, 67, 147, 143, 90, 158, 32097, 58, 6, 144, 137, 112, 159, 110, 78, 20, 14, 140]', '[77, 118, 5, 170, 165, 155, 67, 147, 143, 90, 158]')\n",
      "input decoded: delete <extra_id_0>in been. zu itsit die over make le pe <extra_id_2>?,at). hisis les so de fill me\n",
      "output decoded: in been. zu itsit die over make le pe\n",
      "-----\n",
      "('[9268, 32099, 93, 23, 39, 93, 23, 39, 93, 23, 39, 133, 93, 23, 39, 127, 150, 93, 23, 39, 197, 90, 93, 23, 39, 151, 89, 74, 32097, 93, 23, 39]', '[93, 23, 39, 93, 23, 39, 133, 93, 23, 39, 127, 150, 93, 23, 39, 197, 90, 93, 23, 39, 151, 89, 74]')\n",
      "input decoded: delete <extra_id_0> desi your desi your desi your would desi youror no desi your ce le desi your peoplef der <extra_id_2> desi your\n",
      "output decoded: desi your desi your would desi youror no desi your ce le desi your peoplef der\n",
      "\n",
      "filter\n",
      "0\n",
      "-----\n",
      "('[4191, 32099, 76, 59, 59, 153, 59, 97, 47, 197, 59, 198, 152, 198, 59, 136, 59, 59, 106, 59, 104, 32097, 59]', '[76, 153, 97, 47, 197, 198, 152, 198, 136, 106, 104]')\n",
      "input decoded: filter <extra_id_0>u not not” not time was ce not șian și not any not noton not– <extra_id_2> not\n",
      "output decoded: u” time was ce șian și anyon–\n",
      "-----\n",
      "('[4191, 32099, 71, 117, 173, 180, 98, 133, 4, 49, 92, 173, 180, 98, 133, 4, 49, 92, 59, 32097, 173, 180, 98, 133, 4, 49, 92]', '[71, 117, 59]')\n",
      "input decoded: filter <extra_id_0> A;il Să wouldXer alsoil Să wouldXer also not <extra_id_2>il Să wouldXer also\n",
      "output decoded: A; not\n",
      "-----\n",
      "('[4191, 32099, 36, 192, 197, 44, 60, 192, 105, 73, 116, 33, 44, 60, 192, 105, 133, 34, 108, 108, 27, 32097, 60, 192, 105]', '[36, 192, 197, 44, 73, 116, 33, 44, 133, 34, 108, 108, 27]')\n",
      "input decoded: filter <extra_id_0> be two ce atre two “ un when are atre two “ would it si si I <extra_id_2>re two “\n",
      "output decoded: be two ce at un when are at would it si si I\n",
      "-----\n",
      "('[4191, 32099, 162, 148, 99, 103, 52, 56, 123, 8, 52, 73, 112, 153, 129, 152, 165, 140, 123, 8, 52, 189, 124, 33, 162, 40, 76, 32097, 123, 8, 52]', '[162, 148, 99, 103, 52, 56, 73, 112, 153, 129, 152, 165, 140, 189, 124, 33, 162, 40, 76]')\n",
      "input decoded: filter <extra_id_0>ve Youif dor will cu ther un his” getan its me cu therth care arevelu <extra_id_2> cu ther\n",
      "output decoded: ve Youif dor will un his” getan its meth care arevelu\n",
      "-----\n",
      "('[4191, 32099, 28, 28, 28, 71, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 32097, 28]', '[71]')\n",
      "input decoded: filter <extra_id_0> with with with A with with with with with with with with with with with with with with with with with with <extra_id_2> with\n",
      "output decoded: A\n",
      "\n",
      "get_index\n",
      "0\n",
      "-----\n",
      "('[129, 834, 18288, 32099, 185, 163, 14, 9, 124, 55, 84, 147, 180, 8, 85, 73, 115, 30, 82, 117, 48, 85, 32097, 185, 180, 187, 129, 160, 57, 15, 190, 134, 177]', '[2292]')\n",
      "input decoded: get_index <extra_id_0> au only filla care! which over S theà unb on my; thisà <extra_id_2> au S mai get her bye throughS den\n",
      "output decoded: -1\n",
      "-----\n",
      "('[129, 834, 18288, 32099, 57, 157, 2, 39, 32, 82, 157, 2, 39, 32, 149, 200, 157, 2, 39, 32, 32097, 157, 2, 39, 32]', '[209]')\n",
      "input decoded: get_index <extra_id_0> byk ⁇  youro myk ⁇  youro how bestk ⁇  youro <extra_id_2>k ⁇  youro\n",
      "output decoded: 1\n",
      "-----\n",
      "('[129, 834, 18288, 32099, 185, 108, 141, 142, 106, 86, 103, 49, 11, 43, 201, 32097, 180, 17, 108, 111, 15, 52, 102, 187, 124, 57]', '[2292]')\n",
      "input decoded: get_index <extra_id_0> au si had seon In doer and have), <extra_id_2> St si înerp mai care by\n",
      "output decoded: -1\n",
      "-----\n",
      "('[129, 834, 18288, 32099, 105, 99, 145, 179, 26, 194, 127, 56, 177, 160, 116, 150, 138, 123, 134, 165, 163, 80, 32097, 99, 190, 173, 34, 47, 80, 107, 124, 152, 140, 130]', '[2292]')\n",
      "input decoded: get_index <extra_id_0> “if thanabled wayor will den her when noal cuS its only one <extra_id_2>if throughil it was oneh carean me were\n",
      "output decoded: -1\n",
      "-----\n",
      "('[129, 834, 18288, 32099, 102, 46, 102, 46, 102, 46, 102, 46, 32097, 102, 46]', '[632]')\n",
      "input decoded: get_index <extra_id_0>p anp anp anp an <extra_id_2>p an\n",
      "output decoded: 0\n",
      "\n",
      "search\n",
      "0\n",
      "-----\n",
      "('[960, 32099, 21, 153, 38, 43, 41, 13, 43, 181, 22, 87, 24, 128, 149, 86, 14, 32097, 41, 13, 43, 181]', '[4273]')\n",
      "input decoded: search <extra_id_0> for” as have ( of have mit’/ that some how In fill <extra_id_2> ( of have mit\n",
      "output decoded: yes\n",
      "-----\n",
      "('[960, 32099, 73, 134, 52, 126, 187, 99, 107, 196, 27, 126, 187, 99, 107, 196, 132, 32097, 126, 187, 99, 107, 196]', '[4273]')\n",
      "input decoded: search <extra_id_0> unSr new maiifhI I new maiifhI there <extra_id_2> new maiifhI\n",
      "output decoded: yes\n",
      "-----\n",
      "('[960, 32099, 159, 134, 86, 79, 181, 38, 197, 28, 88, 32, 60, 160, 25, 68, 64, 17, 150, 17, 140, 32097, 88, 32, 60, 160, 25]', '[4273]')\n",
      "input decoded: search <extra_id_0>isS In they mit as ce withheore her you but undt not me <extra_id_2>heore her you\n",
      "output decoded: yes\n",
      "-----\n",
      "('[960, 32099, 57, 151, 95, 141, 31, 188, 118, 70, 45, 124, 21, 188, 118, 70, 45, 124, 98, 32097, 188, 118, 70, 45]', '[4273]')\n",
      "input decoded: search <extra_id_0> by people up had'A been their from care forA been their from careă <extra_id_2>A been their from\n",
      "output decoded: yes\n",
      "-----\n",
      "('[960, 32099, 108, 79, 36, 68, 21, 192, 79, 36, 68, 21, 148, 12, 160, 32097, 79, 36, 68, 21]', '[4273]')\n",
      "input decoded: search <extra_id_0> si they be but for two they be but for You to her <extra_id_2> they be but for\n",
      "output decoded: yes\n",
      "\n",
      "sort\n",
      "0\n",
      "-----\n",
      "('[1843, 32099, 28, 28, 50, 28, 28, 50, 50, 50, 177, 162, 28, 50, 50, 50, 177, 188, 28, 50, 32097, 72, 62, 10, 50, 177, 188, 28, 50, 162]', '[177, 177, 188, 28, 28, 28, 28, 28, 28, 50, 50, 50, 50, 50, 50, 50, 50, 162]')\n",
      "input decoded: sort <extra_id_0> with with la with with la la la denve with la la la denA with la <extra_id_2> more we: la denA with lave\n",
      "output decoded: den denA with with with with with with la la la la la la la lave\n",
      "-----\n",
      "('[1843, 32099, 111, 111, 194, 194, 111, 152, 46, 194, 111, 46, 46, 111, 111, 152, 111, 46, 152, 152, 111, 32097, 152, 111, 194, 187, 38, 46]', '[152, 152, 152, 152, 111, 111, 111, 111, 111, 111, 111, 111, 194, 194, 194, 46, 46, 46, 46]')\n",
      "input decoded: sort <extra_id_0> în în way way înan an way în an an în înan în ananan în <extra_id_2>an în way mai as an\n",
      "output decoded: anananan în în în în în în în în way way way an an an an\n",
      "-----\n",
      "('[1843, 32099, 107, 121, 121, 121, 121, 107, 165, 13, 13, 165, 165, 165, 13, 121, 165, 107, 121, 13, 32097, 121, 63, 13, 82, 107, 59, 165]', '[121, 121, 121, 121, 121, 121, 13, 13, 13, 13, 107, 107, 107, 165, 165, 165, 165, 165]')\n",
      "input decoded: sort <extra_id_0>h\"\"\"\"h its of of its its its of\" itsh\" of <extra_id_2>\"y of myh not its\n",
      "output decoded: \"\"\"\"\"\" of of of ofhhh its its its its its\n",
      "-----\n",
      "('[1843, 32099, 192, 190, 133, 144, 133, 27, 162, 186, 94, 190, 192, 144, 192, 32097, 28, 192, 190, 154, 27, 133, 162, 144, 60, 110, 186, 50, 94]', '[192, 192, 192, 190, 190, 27, 133, 133, 162, 144, 144, 186, 94]')\n",
      "input decoded: sort <extra_id_0> two through wouldat would Ive many It through twoat two <extra_id_2> with two throughé I wouldveatre les many la It\n",
      "output decoded: two two two through through I would wouldveatat many It\n",
      "-----\n",
      "('[1843, 32099, 89, 179, 157, 179, 188, 33, 179, 92, 71, 153, 64, 82, 64, 182, 32097, 82, 66, 179, 71, 188, 157, 89, 182, 33, 64, 95, 153, 92, 179]', '[82, 71, 188, 157, 89, 182, 33, 64, 64, 153, 92, 179, 179, 179]')\n",
      "input decoded: sort <extra_id_0>fablekableA areable also A” und my und very <extra_id_2> my allable AAkf very are und up” alsoable\n",
      "output decoded: my AAkf very are und und” alsoableableable\n",
      "\n",
      "replace\n",
      "0\n",
      "-----\n",
      "('[3601, 32099, 127, 165, 75, 93, 175, 144, 201, 57, 32097, 144, 72]', '[127, 165, 75, 93, 175, 72, 201, 57]')\n",
      "input decoded: replace <extra_id_0>or itsc des theseat), by <extra_id_2>at more\n",
      "output decoded: or itsc des these more), by\n",
      "-----\n",
      "('[3601, 32099, 6, 85, 91, 144, 199, 25, 36, 46, 30, 172, 58, 184, 14, 49, 147, 59, 124, 147, 23, 145, 51, 32097, 85, 90]', '[6, 90, 91, 144, 199, 25, 36, 46, 30, 172, 58, 184, 14, 49, 147, 59, 124, 147, 23, 145, 51]')\n",
      "input decoded: replace <extra_id_0>,à outat help you be an onz?& filler over not care overi thanm <extra_id_2>à le\n",
      "output decoded: , le outat help you be an onz?& filler over not care overi thanm\n",
      "-----\n",
      "('[3601, 32099, 133, 118, 86, 14, 97, 84, 17, 31, 135, 192, 130, 146, 93, 42, 166, 179, 87, 128, 17, 32097, 31, 66]', '[133, 118, 86, 14, 97, 84, 17, 66, 135, 192, 130, 146, 93, 42, 166, 179, 87, 128, 17]')\n",
      "input decoded: replace <extra_id_0> would been In fill time whicht' them two were du des or firstable/ somet <extra_id_2>' all\n",
      "output decoded: would been In fill time whicht all them two were du des or firstable/ somet\n",
      "-----\n",
      "('[3601, 32099, 183, 120, 166, 51, 192, 189, 19, 109, 49, 133, 75, 2, 30, 164, 170, 64, 200, 28, 172, 70, 61, 64, 183, 140, 32097, 2, 105]', '[183, 120, 166, 51, 192, 189, 19, 109, 49, 133, 75, 105, 30, 164, 170, 64, 200, 28, 172, 70, 61, 64, 183, 140]')\n",
      "input decoded: replace <extra_id_0> amly firstm twoth isleer wouldc ⁇  on may zu und best withz their) und am me <extra_id_2> ⁇  “\n",
      "output decoded: amly firstm twoth isleer wouldc “ on may zu und best withz their) und am me\n",
      "-----\n",
      "('[3601, 32099, 184, 130, 37, 147, 28, 149, 163, 96, 32, 111, 128, 27, 59, 62, 153, 104, 32097, 104, 27]', '[184, 130, 37, 147, 28, 149, 163, 96, 32, 111, 128, 27, 59, 62, 153, 27]')\n",
      "input decoded: replace <extra_id_0>& were The over with how only \"o în some I not we”– <extra_id_2>– I\n",
      "output decoded: & were The over with how only \"o în some I not we” I\n",
      "\n",
      "replace_many\n",
      "0\n",
      "-----\n",
      "('[3601, 834, 348, 63, 32099, 61, 180, 92, 154, 183, 39, 83, 143, 192, 183, 141, 151, 32097, 192, 2, 154, 30, 151, 160, 183, 104, 83, 15, 92, 153, 141, 11]', '[61, 180, 153, 30, 104, 39, 15, 143, 2, 104, 11, 160]')\n",
      "input decoded: replace_many <extra_id_0>) S alsoé am yourul make two am had people <extra_id_2> two ⁇ é on people her am–ule also” had and\n",
      "output decoded: ) S” on– youre make ⁇ – and her\n",
      "-----\n",
      "('[3601, 834, 348, 63, 32099, 82, 108, 44, 27, 23, 191, 60, 137, 149, 188, 103, 46, 82, 153, 32097, 103, 64, 149, 168]', '[82, 108, 44, 27, 23, 191, 60, 137, 168, 188, 64, 46, 82, 153]')\n",
      "input decoded: replace_many <extra_id_0> my si at Ii pentrure). howA do an my” <extra_id_2> do und how well\n",
      "output decoded: my si at Ii pentrure). wellA und an my”\n",
      "-----\n",
      "('[3601, 834, 348, 63, 32099, 141, 67, 31, 35, 36, 191, 9, 159, 164, 70, 49, 30, 64, 184, 34, 9, 23, 32097, 164, 26, 49, 99, 35, 33, 9, 84, 67, 192, 184, 68]', '[141, 192, 31, 33, 36, 191, 84, 159, 26, 70, 99, 30, 64, 68, 34, 84, 23]')\n",
      "input decoded: replace_many <extra_id_0> had die'en be pentruais may theirer on und& itai <extra_id_2> mayderifen area which die two& but\n",
      "output decoded: had two' are be pentru whichisd theirif on und but it whichi\n",
      "-----\n",
      "('[3601, 834, 348, 63, 32099, 179, 26, 6, 76, 12, 110, 144, 171, 179, 181, 88, 38, 199, 79, 11, 116, 32097, 144, 39, 76, 129, 88, 75]', '[179, 26, 6, 129, 12, 110, 39, 171, 179, 181, 75, 38, 199, 79, 11, 116]')\n",
      "input decoded: replace_many <extra_id_0>abled,u to lesat pourable mithe as help they and when <extra_id_2>at youru gethec\n",
      "output decoded: abled, get to les your pourable mitc as help they and when\n",
      "-----\n",
      "('[3601, 834, 348, 63, 32099, 101, 109, 164, 53, 66, 187, 176, 29, 197, 193, 50, 47, 74, 30, 123, 63, 166, 163, 32097, 176, 18, 163, 136, 197, 99]', '[101, 109, 164, 53, 66, 187, 18, 29, 99, 193, 50, 47, 74, 30, 123, 63, 166, 136]')\n",
      "input decoded: replace_many <extra_id_0> Wele maying all mai dinn ce von la was der on cuy first only <extra_id_2> din- only any ceif\n",
      "output decoded: Wele maying all mai-nif von la was der on cuy first any\n",
      "\n",
      "union\n",
      "0\n",
      "-----\n",
      "('[7021, 32099, 149, 111, 153, 130, 153, 130, 197, 83, 44, 130, 130, 198, 153, 50, 50, 78, 153, 50, 50, 78, 32097, 197, 111, 111]', '[130, 197, 198, 44, 78, 111, 50, 83, 149, 153]')\n",
      "input decoded: union <extra_id_0> how în” were” were ceul at were were și” la la so” la la so <extra_id_2> ce în în\n",
      "output decoded: were ce și at so în laul how”\n",
      "-----\n",
      "('[7021, 32099, 61, 61, 99, 36, 32097, 14, 99, 14, 99, 99, 36]', '[99, 36, 61, 14]')\n",
      "input decoded: union <extra_id_0>))if be <extra_id_2> fillif fillifif be\n",
      "output decoded: if be) fill\n",
      "-----\n",
      "('[7021, 32099, 77, 141, 174, 141, 32097, 90, 141, 174, 196, 77, 174, 160, 160, 160, 174, 141]', '[160, 196, 77, 174, 141, 90]')\n",
      "input decoded: union <extra_id_0>in had need had <extra_id_2> le had needIin need her her her need had\n",
      "output decoded: herIin need had le\n",
      "-----\n",
      "('[7021, 32099, 113, 102, 102, 113, 189, 11, 135, 107, 102, 189, 32097, 24, 24, 24, 11, 189, 107]', '[102, 135, 107, 11, 113, 24, 189]')\n",
      "input decoded: union <extra_id_0> whopp whoth and themhpth <extra_id_2> that that that andthh\n",
      "output decoded: p themh and who thatth\n",
      "-----\n",
      "('[7021, 32099, 73, 121, 90, 191, 9, 90, 191, 9, 32097, 191, 121, 191, 90, 191, 73, 121]', '[73, 9, 121, 90, 191]')\n",
      "input decoded: union <extra_id_0> un\" le pentrua le pentrua <extra_id_2> pentru\" pentru le pentru un\"\n",
      "output decoded: una\" le pentru\n",
      "\n",
      "intersect\n",
      "0\n",
      "-----\n",
      "('[27806, 32099, 14, 167, 156, 14, 36, 36, 144, 15, 156, 86, 14, 167, 14, 144, 32097, 156, 144, 95, 14, 82, 161, 86]', '[144, 86, 156, 14]')\n",
      "input decoded: intersect <extra_id_0> fill most If fill be beate If In fill most fillat <extra_id_2> Ifat up fill my work In\n",
      "output decoded: at In If fill\n",
      "-----\n",
      "('[27806, 32099, 31, 17, 17, 32097, 77, 31, 17, 31, 31, 31]', '[17, 31]')\n",
      "input decoded: intersect <extra_id_0>'tt <extra_id_2>in't'''\n",
      "output decoded: t'\n",
      "-----\n",
      "('[27806, 32099, 171, 148, 77, 189, 114, 157, 38, 172, 148, 22, 77, 153, 32097, 41, 77, 77, 153, 77, 22, 172, 171, 22, 172, 189, 77, 114, 171, 171, 77]', '[171, 172, 77, 114, 22, 153, 189]')\n",
      "input decoded: intersect <extra_id_0> pour Youinth likek asz You’in” <extra_id_2> (inin”in’z pour’zthin like pour pourin\n",
      "output decoded: pourzin like’”th\n",
      "-----\n",
      "('[27806, 32099, 140, 94, 115, 94, 140, 140, 32097, 193, 193, 142, 193]', '[32096]')\n",
      "input decoded: intersect <extra_id_0> me Itb It me me <extra_id_2> von von se von\n",
      "output decoded: <extra_id_3>\n",
      "-----\n",
      "('[27806, 32099, 19, 152, 152, 32097, 170, 193, 137, 193, 76, 193, 19, 152, 19, 76, 121]', '[152, 19]')\n",
      "input decoded: intersect <extra_id_0> isanan <extra_id_2> zu von). vonu von isan isu\"\n",
      "output decoded: an is\n",
      "\n",
      "set_1_minus_2\n",
      "0\n",
      "-----\n",
      "('[356, 834, 536, 834, 14078, 834, 357, 32099, 32097, 106, 106, 144, 70, 70, 114, 106, 106, 144]', '[32096]')\n",
      "input decoded: set_1_minus_2 <extra_id_0> <extra_id_2>ononat their their likeononat\n",
      "output decoded: <extra_id_3>\n",
      "-----\n",
      "('[356, 834, 536, 834, 14078, 834, 357, 32099, 164, 164, 77, 77, 32097, 162, 64, 162, 162, 164, 77, 73, 73, 164, 73, 64]', '[32096]')\n",
      "input decoded: set_1_minus_2 <extra_id_0> may mayinin <extra_id_2>ve undveve mayin un un may un und\n",
      "output decoded: <extra_id_3>\n",
      "-----\n",
      "('[356, 834, 536, 834, 14078, 834, 357, 32099, 125, 199, 125, 63, 69, 69, 179, 199, 27, 109, 27, 27, 27, 125, 179, 15, 59, 69, 59, 189, 58, 58, 58, 138, 53, 32097, 59, 199, 58]', '[69, 138, 109, 15, 179, 53, 125, 27, 189, 63]')\n",
      "input decoded: set_1_minus_2 <extra_id_0> what help whaty our ourable help Ile I I I whatablee not our notth???aling <extra_id_2> not help?\n",
      "output decoded: ouralleeableing what Ithy\n",
      "-----\n",
      "('[356, 834, 536, 834, 14078, 834, 357, 32099, 153, 57, 162, 33, 187, 153, 32097, 33, 149, 149, 36, 57, 38, 36, 36, 57, 57, 33, 187, 153, 149, 57, 62, 184, 33, 187]', '[162]')\n",
      "input decoded: set_1_minus_2 <extra_id_0>” byve are mai” <extra_id_2> are how how be by as be be by by are mai” how by we& are mai\n",
      "output decoded: ve\n",
      "-----\n",
      "('[356, 834, 536, 834, 14078, 834, 357, 32099, 5, 116, 58, 116, 5, 191, 32097, 191, 191, 5, 58, 161, 161, 191, 191, 34, 110]', '[116]')\n",
      "input decoded: set_1_minus_2 <extra_id_0>. when? when. pentru <extra_id_2> pentru pentru.? work work pentru pentru it les\n",
      "output decoded: when\n",
      "\n",
      "set_2_minus_1\n",
      "0\n",
      "-----\n",
      "('[356, 834, 357, 834, 14078, 834, 536, 32099, 100, 180, 180, 179, 190, 191, 190, 186, 121, 179, 191, 81, 190, 107, 121, 190, 81, 32097, 180, 47, 179, 100, 100, 191, 190, 186, 107]', '[47]')\n",
      "input decoded: set_2_minus_1 <extra_id_0> This S Sable through pentru through many\"able pentru about throughh\" through about <extra_id_2> S wasable This This pentru through manyh\n",
      "output decoded: was\n",
      "-----\n",
      "('[356, 834, 357, 834, 14078, 834, 536, 32099, 87, 136, 136, 161, 84, 136, 161, 179, 20, 101, 84, 32097, 136, 136, 84, 161, 87, 136, 155, 84]', '[155]')\n",
      "input decoded: set_2_minus_1 <extra_id_0>/ any any work which any workable de We which <extra_id_2> any any which work/ anyit which\n",
      "output decoded: it\n",
      "-----\n",
      "('[356, 834, 357, 834, 14078, 834, 536, 32099, 58, 81, 81, 148, 38, 148, 74, 58, 38, 32097, 148, 69, 38, 69, 42, 195, 58, 74, 10, 69, 70, 195, 195]', '[195, 69, 70, 10, 42]')\n",
      "input decoded: set_2_minus_1 <extra_id_0>? about about You as You der? as <extra_id_2> You our as our orll? der: our theirllll\n",
      "output decoded: ll our their: or\n",
      "-----\n",
      "('[356, 834, 357, 834, 14078, 834, 536, 32099, 187, 187, 47, 187, 106, 9, 90, 136, 182, 96, 182, 200, 187, 96, 99, 47, 75, 136, 183, 47, 187, 9, 32097, 99, 200, 90, 47, 90]', '[32096]')\n",
      "input decoded: set_2_minus_1 <extra_id_0> mai mai was maiona le any very \" very best mai \"if wasc any am was maia <extra_id_2>if best le was le\n",
      "output decoded: <extra_id_3>\n",
      "-----\n",
      "('[356, 834, 357, 834, 14078, 834, 536, 32099, 128, 188, 128, 128, 20, 20, 32097, 128, 152, 5, 152, 148, 20, 152, 20]', '[152, 148, 5]')\n",
      "input decoded: set_2_minus_1 <extra_id_0> someA some some de de <extra_id_2> somean.an You dean de\n",
      "output decoded: an You.\n"
     ]
    }
   ],
   "source": [
    "# Names of the tasks to preview, in display order.\n",
    "# NOTE(review): 'binary' presumably means each task takes two inputs - confirm.\n",
    "BINARY_BASIC_TASKS = [\n",
    "    'count',\n",
    "    'delete',\n",
    "    'filter',\n",
    "    'get_index',\n",
    "    'search',\n",
    "    'sort',\n",
    "    'replace',\n",
    "    'replace_many',\n",
    "    'union',\n",
    "    'intersect',\n",
    "    'set_1_minus_2',\n",
    "    'set_2_minus_1',\n",
    "]\n",
    "\n",
    "# For each task: print a blank line and the task name as a header, then ask\n",
    "# show_task_examples for five printed examples.\n",
    "for task in BINARY_BASIC_TASKS:\n",
    "    print('\\n' + task)\n",
    "    show_task_examples(task, num_examples=5, is_print=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ValueError: [2, 2] is not in list\n"
     ]
    }
   ],
   "source": [
    "# list.index() looks for a single element equal to its argument, not for a\n",
    "# contiguous sub-sequence, so searching for [2, 2] fails with ValueError.\n",
    "# The error is caught and printed so this demo does not abort a full re-run.\n",
    "try:\n",
    "    [1, 2, 2, 3].index([2, 2])\n",
    "except ValueError as err:\n",
    "    print(f'ValueError: {err}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A string that does not contain the separator splits into a one-element list.\n",
    "'a'.split(sep=', ')"
   ]
  },
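  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example `last_char` instance (the output ids start with id 3, the bare `▁` piece):\n",
    "\n",
    "```\n",
    "('last_char <extra_id_0> assessmentten Alonglapse', 'lapse')\n",
    "input token IDs [336, 834, 4059, 32099, 4193, 324, 8529, 16543]\n",
    "output token IDs [3, 16543]\n",
    "```"
   ]
  },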
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the piece stored under id 3; the recorded output shows the bare '▁' marker.\n",
    "T.id_to_piece(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[8909, 2376, 4165, 26815, 26815, 4165, 26815, 30679, 30679, 4165, 2376, 4165, 8909, 4165, 8909]\n"
     ]
    }
   ],
   "source": [
    "# Build one sample with generate_set_data and print it; x is reused by the next cell.\n",
    "# NOTE(review): (4,20) is presumably a (min, max) size range - confirm against generate_set_data.\n",
    "x = generate_set_data((4,20), TASKS_TOKEN_IDS)\n",
    "print(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[8909, 2376, 4165, 26815, 26815, 4165, 26815, 30679, 30679, 4165, 2376, 4165, 8909, 4165, 8909]\n"
     ]
    }
   ],
   "source": [
    "# Round-trip check: decode x to text and encode it again. The recorded output\n",
    "# is identical to the list printed for x, i.e. the ids survived the round trip.\n",
    "print(TOKENIZER.encode(TOKENIZER.decode(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
