{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import spacy\n",
|
|
"\n",
|
|
"en_grammar = spacy.load('en_core_web_sm')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"C:\\Users\\75872\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torchaudio\\backend\\utils.py:67: UserWarning: No audio backend is available.\n",
|
|
" warnings.warn('No audio backend is available.')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Import only the tokenizer class this notebook uses instead of the whole package namespace.\n",
|
|
"from transformers import BertTokenizer\n",
|
|
"\n",
|
|
"# Checkpoint name passed to from_pretrained below.\n",
|
|
"BERT_NAME = 'bert-base-uncased'\n",
|
|
"tok = BertTokenizer.from_pretrained(BERT_NAME)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"when [[]] ceases and [[]] makes [[]] toward [[]], [[]] says to drop [[]] after [[]] starting at [[]].\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"sentence = \"\"\"\n",
|
|
"when the drop ceases and the curve makes an elbow toward less steep decline, cattell's scree test says to drop all further components after the one starting at the elbow.\n",
|
|
"\"\"\"\n",
|
|
"def maskNoun(sentence: str):\n",
|
|
"    \"\"\"Return the text with every spaCy noun chunk replaced by the `[[]]` placeholder.\n",
|
|
"\n",
|
|
"    The text is parsed with the notebook-level `en_grammar` pipeline and each span\n",
|
|
"    yielded by `doc.noun_chunks` is swapped for the literal `[[]]`; everything\n",
|
|
"    outside the noun chunks is copied through unchanged.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        sentence: raw text to mask.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        The masked text as a new string.\n",
|
|
"    \"\"\"\n",
|
|
"    doc = en_grammar(sentence)\n",
|
|
"    text = doc.text  # the span offsets below index into this string\n",
|
|
"    pieces = []\n",
|
|
"    cursor = 0  # character offset up to which `text` has been copied\n",
|
|
"    for noun in doc.noun_chunks:\n",
|
|
"        # Cut by the span's own character offsets. A text search such as\n",
|
|
"        # str.replace(noun.text, ...) hits the first textual match, which can lie\n",
|
|
"        # inside an earlier word (e.g. the chunk 'it' inside 'with').\n",
|
|
"        pieces.append(text[cursor:noun.start_char])\n",
|
|
"        pieces.append('[[]]')\n",
|
|
"        cursor = noun.end_char\n",
|
|
"    pieces.append(text[cursor:])\n",
|
|
"    return ''.join(pieces)\n",
|
|
"print(maskNoun(sentence))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['when', 'the', 'drop', 'cease', '##s', 'and', 'the', 'curve', 'makes', 'an', 'elbow', 'toward', 'less', 'steep', 'decline', ',', 'cat', '##tell', \"'\", 's', 'sc', '##ree', 'test', 'says', 'to', 'drop', 'all', 'further', 'components', 'after', 'the', 'one', 'starting', 'at', 'the', 'elbow', '.']\n",
|
|
"['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Tokenize the raw sentence and its noun-masked variant with the same tokenizer.\n",
|
|
"tokList, tokMask = (tok.tokenize(text) for text in (sentence, maskNoun(sentence)))\n",
|
|
"\n",
|
|
"# One token list per output line: original first, masked second.\n",
|
|
"print(tokList, tokMask, sep='\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['when', '[MASK]', '[MASK]', 'cease', '##s', 'and', '[MASK]', '[MASK]', 'makes', '[MASK]', '[MASK]', 'toward', '[MASK]', '[MASK]', '[MASK]', ',', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'says', 'to', 'drop', '[MASK]', '[MASK]', '[MASK]', 'after', '[MASK]', '[MASK]', 'starting', 'at', '[MASK]', '[MASK]', '.']\n",
|
|
"['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def maskTokList(self, tokList, tokMask):\n",
|
|
"    \"\"\"Overwrite the tokens of `tokList` that a `[[]]` placeholder stands for with `[MASK]`.\n",
|
|
"\n",
|
|
"    `tokMask` is expected to be the token list of the same text after some spans\n",
|
|
"    were replaced by `[[]]`, with the placeholder showing up as the four tokens\n",
|
|
"    `[`, `[`, `]`, `]`. The two lists are walked in parallel: from a placeholder\n",
|
|
"    on, tokens of `tokList` are masked until one equals the first token that\n",
|
|
"    follows the placeholder in `tokMask`.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        self: unused; kept so existing calls that pass a leading argument keep working.\n",
|
|
"        tokList: tokens of the unmasked text; modified in place.\n",
|
|
"        tokMask: tokens of the masked text; only read.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        `tokList` itself, after masking.\n",
|
|
"    \"\"\"\n",
|
|
"    width = 4  # the placeholder spans four entries of tokMask\n",
|
|
"    off = 0  # idx + off is the position in tokMask lined up with tokList[idx]\n",
|
|
"    for idx, elem in enumerate(tokList):\n",
|
|
"        idx_ = idx + off\n",
|
|
"        if ''.join(tokMask[idx_:idx_ + width]) != '[[]]':\n",
|
|
"            continue\n",
|
|
"        # Two masked spans separated only by whitespace give back-to-back placeholders;\n",
|
|
"        # the token that ends the masked run is the one after the last of them.\n",
|
|
"        nxt = idx_ + width\n",
|
|
"        while ''.join(tokMask[nxt:nxt + width]) == '[[]]':\n",
|
|
"            nxt += width\n",
|
|
"        # A placeholder can be the last thing in tokMask, so guard the lookup.\n",
|
|
"        if nxt < len(tokMask) and tokMask[nxt] == elem:\n",
|
|
"            off += nxt - idx_  # realign past the placeholder(s); elem itself is kept\n",
|
|
"            continue\n",
|
|
"        tokList[idx] = '[MASK]'\n",
|
|
"        off -= 1  # stay on the placeholder while its tokens are consumed\n",
|
|
"    return tokList\n",
|
|
"\n",
|
|
"\n",
|
|
"# Apply the masking before printing: maskTokList edits tokList in place, and without\n",
|
|
"# this call a fresh Restart & Run All would print the unmasked token list.\n",
|
|
"maskTokList(None, tokList, tokMask)\n",
|
|
"\n",
|
|
"print(tokList)\n",
|
|
"print(tokMask)\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"interpreter": {
|
|
"hash": "f29e8b3fa2d991a6f8847b235850bc2cfc73e5042ba8efb84ff0f4dcd41902ea"
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3.9.6 64-bit",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.6"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|