where pure knowledge is acquired by just reading
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

143 lines
4.4 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"en_grammar = spacy.load('en_core_web_sm')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\75872\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torchaudio\\backend\\utils.py:67: UserWarning: No audio backend is available.\n",
" warnings.warn('No audio backend is available.')\n"
]
}
],
"source": [
"from transformers import *\n",
"\n",
"BERT_NAME = 'bert-base-uncased'\n",
"tok = BertTokenizer.from_pretrained(BERT_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"when [[]] ceases and [[]] makes [[]] toward [[]], [[]] says to drop [[]] after [[]] starting at [[]].\n",
"\n"
]
}
],
"source": [
"sentence = \"\"\"\n",
"when the drop ceases and the curve makes an elbow toward less steep decline, cattell's scree test says to drop all further components after the one starting at the elbow.\n",
"\"\"\"\n",
"def maskNoun(sentence: str):\n",
"    \"\"\"Return `sentence` with every spaCy noun chunk replaced by '[[]]'.\n",
"\n",
"    Each chunk is cut out by its character offsets instead of by searching\n",
"    for its text, so a chunk whose text also occurs earlier in the sentence\n",
"    (possibly inside a longer word) cannot cause the wrong span to be masked.\n",
"    \"\"\"\n",
"    doc = en_grammar(sentence)\n",
"    pieces = []\n",
"    cursor = 0  # character offset just past the previously masked chunk\n",
"    # NOTE(review): assumes doc.noun_chunks yields non-overlapping spans in\n",
"    # document order -- confirm against the spaCy version in use.\n",
"    for noun in doc.noun_chunks:\n",
"        pieces.append(sentence[cursor:noun.start_char])\n",
"        pieces.append('[[]]')\n",
"        cursor = noun.end_char\n",
"    pieces.append(sentence[cursor:])\n",
"    return ''.join(pieces)\n",
"print(maskNoun(sentence))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['when', 'the', 'drop', 'cease', '##s', 'and', 'the', 'curve', 'makes', 'an', 'elbow', 'toward', 'less', 'steep', 'decline', ',', 'cat', '##tell', \"'\", 's', 'sc', '##ree', 'test', 'says', 'to', 'drop', 'all', 'further', 'components', 'after', 'the', 'one', 'starting', 'at', 'the', 'elbow', '.']\n",
"['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"
]
}
],
"source": [
"tokList = tok.tokenize(sentence)\n",
"tokMask = tok.tokenize(maskNoun(sentence))\n",
"\n",
"print(tokList)\n",
"print(tokMask)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['when', '[MASK]', '[MASK]', 'cease', '##s', 'and', '[MASK]', '[MASK]', 'makes', '[MASK]', '[MASK]', 'toward', '[MASK]', '[MASK]', '[MASK]', ',', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'says', 'to', 'drop', '[MASK]', '[MASK]', '[MASK]', 'after', '[MASK]', '[MASK]', 'starting', 'at', '[MASK]', '[MASK]', '.']\n",
"['when', '[', '[', ']', ']', 'cease', '##s', 'and', '[', '[', ']', ']', 'makes', '[', '[', ']', ']', 'toward', '[', '[', ']', ']', ',', '[', '[', ']', ']', 'says', 'to', 'drop', '[', '[', ']', ']', 'after', '[', '[', ']', ']', 'starting', 'at', '[', '[', ']', ']', '.']\n"
]
}
],
"source": [
"def maskTokList(self, tokList, tokMask):\n",
"    \"\"\"Overwrite with '[MASK]' each token of `tokList` hidden in `tokMask`.\n",
"\n",
"    `tokMask` is the tokenization of the same sentence after some spans were\n",
"    replaced by the placeholder '[[]]', which the tokenizer splits into the\n",
"    four tokens '[', '[', ']', ']'. `tokList` is modified in place and the\n",
"    same list is returned. `self` is unused.\n",
"    \"\"\"\n",
"    off = 0  # offset turning an index of tokList into an index of tokMask\n",
"    for idx, elem in enumerate(tokList):\n",
"        idx_ = idx + off\n",
"        if ''.join(tokMask[idx_:idx_ + 4]) != '[[]]':\n",
"            continue\n",
"        after = idx_ + 4\n",
"        # A placeholder that ends tokMask has no following token; every\n",
"        # remaining token of tokList then belongs to the masked span.\n",
"        if after < len(tokMask) and tokMask[after] == elem:\n",
"            # elem is the first token after the masked span: step over the\n",
"            # four placeholder tokens and leave elem untouched.\n",
"            off += 4\n",
"            continue\n",
"        tokList[idx] = '[MASK]'\n",
"        off -= 1  # stay on the same placeholder for the next token\n",
"    return tokList\n",
"\n",
"\n",
"print(tokList)\n",
"print(tokMask)\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "f29e8b3fa2d991a6f8847b235850bc2cfc73e5042ba8efb84ff0f4dcd41902ea"
},
"kernelspec": {
"display_name": "Python 3.9.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}