where pure knowledge is acquired by just reading
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

57 lines
1.7 KiB

import nltk
import os
import json
from random import randint
def splitSubSection(sent):
    """Strip wiki sub-section headings from *sent*.

    Splitting on a run of '=' signs alternates between body text (even
    indices) and heading text (odd indices); the headings are dropped and
    the surviving fragments are rejoined with '. ' so sentence
    tokenization still works downstream.
    """
    def every_other(items):
        # Keep only the elements at even positions (the non-heading parts).
        return (item for pos, item in enumerate(items) if pos % 2 == 0)

    # Longest marker first, so '======' is not consumed piecemeal by '==='.
    for marker in ('======', '=====', '====', '==='):
        sent = '. '.join(every_other(sent.split(marker)))
    return sent
def parseUniSection(concept, uniSection):
    """Yield sentences from one wiki section with concept/link markup.

    Every occurrence of *concept* is wrapped in [@@)...(@@] markers, and
    each link's surface text (located via its character offsets into the
    original section text) is wrapped in [@|)...(|@] markers. The section
    heading is removed, newlines become sentence breaks, sub-section
    headings are stripped, and the result is sentence-tokenized.

    Args:
        concept: the page title to mark up wherever it appears.
        uniSection: dict with at least 'text', 'title', and 'links'
            (each link a dict with 'pos_start', 'pos_end', 'text').

    Yields:
        str: individual sentences (with markers embedded).
    """
    # Wrap every occurrence of the concept title in concept markers.
    text = f'[@@){concept}(@@]'.join(uniSection['text'].split(concept))
    # Wrap each link's surface text in link markers. Only the first match
    # is replaced because the offsets refer to the original, un-marked text
    # and each link should be consumed exactly once.
    for link in uniSection['links']:
        text = text.replace(
            uniSection['text'][link['pos_start']:link['pos_end']],
            f'[@|){link["text"]}(|@]', 1
        )
    text = text.replace(f'=={uniSection["title"]}==', '').replace('\n', '. ')
    text = splitSubSection(text)
    # NOTE(review): the original had `text.replace('|', '')` here with the
    # result discarded — str.replace is not in-place, so it was a dead no-op
    # and pipes were never actually stripped. Removed the dead statement to
    # preserve behavior. If stripping '|' was the real intent, note it would
    # also mangle the [@|)...(|@] link markers — confirm before reinstating.
    yield from nltk.sent_tokenize(text)
def parseUniJSON(uniJSON):
    """Yield the usable sentences from a parsed wiki-dump JSON structure.

    Walks every concept entry and every section within it, delegating
    markup and tokenization to parseUniSection, and keeps only sentences
    of at least 32 whitespace-separated tokens.
    """
    for entry in uniJSON:
        title = entry['title']
        for section in entry['sections']:
            for sentence in parseUniSection(title, section):
                # Only long sentences survive; short ones carry too
                # little context for training.
                if len(sentence.split()) >= 32:
                    yield sentence
if __name__ == '__main__':
    # Build train.csv: a '0' header line, then one double-quoted sentence
    # per record (blank-line separated) extracted from every JSON file in
    # the Dataset/ directory. Input text is lower-cased wholesale before
    # parsing so concept matching is case-insensitive.
    #
    # Fix: the original used a bare open()/close() pair for the output
    # file, which leaked the handle (and could lose buffered writes) if
    # any exception occurred mid-loop; a with-block guarantees cleanup.
    with open('train.csv', 'w', encoding='utf-8') as out:
        out.write('0\n')
        for dirName in os.listdir('Dataset'):
            with open(f'Dataset/{dirName}', 'r', encoding='utf-8') as f:
                uniJSON = json.loads(f.read().lower())
            for sent in parseUniJSON(uniJSON):
                # CSV-style quoting: escape embedded quotes, flatten
                # newlines, drop asterisks.
                out.write(
                    '\"' +
                    sent.replace('\n', ' ')
                        .replace('\"', '\\\"')
                        .replace('*', '')
                    + '\"\n\n'
                )