10205501401
/
brog


								{

								 "cells": [

								  {

								   "cell_type": "code",

								   "execution_count": 2,

								   "metadata": {},

								   "outputs": [],

								   "source": [

								    "import spacy\n",

								    "\n",

								    "en_grammar = spacy.load('en_core_web_sm')"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 3,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stderr",

								     "output_type": "stream",

								     "text": [

								      "C:\\Users\\75872\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torchaudio\\backend\\utils.py:67: UserWarning: No audio backend is available.\n",

								      "  warnings.warn('No audio backend is available.')\n"

								     ]

								    }

								   ],

								   "source": [

								    "from transformers import *\n",

								    "\n",

								    "BERT_NAME = 'bert-base-uncased'\n",

								    "tok = BertTokenizer.from_pretrained(BERT_NAME)"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 5,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "\n",

								      "when [[]] ceases and [[]] makes [[]] toward [[]], [[]] says to drop [[]] after [[]] starting at [[]].\n",

								      "\n"

								     ]

								    }

								   ],

								   "source": [

								    "sentence = \"\"\"\n",

								    "when the drop ceases and the curve makes an elbow toward less steep decline, cattell's scree test says to drop all further components after the one starting at the elbow.\n",

								    "\"\"\"\n",

								    "def maskNoun(sentence: str):\n",

								    "    doc = en_grammar(sentence)\n",

								    "    for noun in doc.noun_chunks:\n",

								    "        sentence = sentence.replace(noun.text, '[[]]', 1)\n",

								    "    return sentence\n",

								    "print(maskNoun(sentence))"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 29,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "['when', 'the', 'drop', 'cease', '##s', 'and', 'the', 'curve', 'makes', 'an', 'elbow', 'toward', 'less', 'steep', 'decline', ',', 'cat', '##tell', \"'\", 's', 'sc', '##ree', 'test', 'says', 'to', 'drop', 'all', 'further', 'components', 'after', 'the', 'one', 'starting', 'at', 'the', 'elbow', '.']\n",

								      "['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"

								     ]

								    }

								   ],

								   "source": [

								    "tokList = tok.tokenize(sentence)\n",

								    "tokMask = tok.tokenize(maskNoun(sentence))\n",

								    "\n",

								    "print(tokList)\n",

								    "print(tokMask)"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 32,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "['when', '[MASK]', '[MASK]', 'cease', '##s', 'and', '[MASK]', '[MASK]', 'makes', '[MASK]', '[MASK]', 'toward', '[MASK]', '[MASK]', '[MASK]', ',', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'says', 'to', 'drop', '[MASK]', '[MASK]', '[MASK]', 'after', '[MASK]', '[MASK]', 'starting', 'at', '[MASK]', '[MASK]', '.']\n",

								      "['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"

								     ]

								    }

								   ],

								   "source": [

								    "def maskTokList(self, tokList, tokMask):\n",

								    "    off = 0\n",

								    "    for idx, elem in enumerate(tokList):\n",

								    "        idx_ = idx + off\n",

								    "        flag = ''.join(tokMask[idx_: idx_+4])\n",

								    "        if flag == '[[]]':\n",

								    "            if tokMask[idx_+4] == elem:\n",

								    "                off += 4\n",

								    "                continue\n",

								    "            tokList[idx] = '[MASK]'\n",

								    "            off -= 1\n",

								    "    return tokList\n",

								    "\n",

								    "\n",

								    "print(tokList)\n",

								    "print(tokMask)\n"

								   ]

								  }

								 ],

								 "metadata": {

								  "interpreter": {

								   "hash": "f29e8b3fa2d991a6f8847b235850bc2cfc73e5042ba8efb84ff0f4dcd41902ea"

								  },

								  "kernelspec": {

								   "display_name": "Python 3.9.6 64-bit",

								   "language": "python",

								   "name": "python3"

								  },

								  "language_info": {

								   "codemirror_mode": {

								    "name": "ipython",

								    "version": 3

								   },

								   "file_extension": ".py",

								   "mimetype": "text/x-python",

								   "name": "python",

								   "nbconvert_exporter": "python",

								   "pygments_lexer": "ipython3",

								   "version": "3.9.6"

								  },

								  "orig_nbformat": 4

								 },

								 "nbformat": 4,

								 "nbformat_minor": 2

								}