import json
import os

import nltk  # sent_tokenize below needs the 'punkt' tokenizer data
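# Each file in Dataset/ is assumed to hold a JSON list shaped roughly like
# this (inferred from the field accesses below):
# [
#   {
#     "title": "<concept>",
#     "sections": [
#       {
#         "title": "<section title>",
#         "text": "<section text>",
#         "links": [{"text": "...", "pos_start": 0, "pos_end": 3}, ...]
#       },
#       ...
#     ]
#   },
#   ...
# ]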
def splitSubSection(sent):
    """Drop wiki-style sub-section headings (===Title===) and keep the body text."""

    def takeEven(lst):
        # Splitting on a heading token leaves body text at even indices
        # and heading titles at odd indices; keep only the body chunks.
        for i, elem in enumerate(lst):
            if i % 2 == 0:
                yield elem

    # Process longer heading markers first so they are not split apart
    # by the shorter ones.
    for tok in ['======', '=====', '====', '===']:
        sent = '. '.join(takeEven(sent.split(tok)))
    return sent
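# For example: splitSubSection('intro===history===more text')
# returns 'intro. more text'.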
def parseUniSection(concept, uniSection):
    # Wrap every mention of the concept in [@@)...(@@] markers.
    text = f'[@@){concept}(@@]'.join(uniSection['text'].split(concept))
    # Wrap each outgoing link in [@|)...(|@] markers. The offsets refer to
    # the original, untagged text, so we look the substring up there and
    # replace its first remaining occurrence in the tagged text.
    for link in uniSection['links']:
        text = text.replace(
            uniSection['text'][link['pos_start']:link['pos_end']],
            f'[@|){link["text"]}(|@]', 1
        )
    # Strip the section's own ==Title== heading and flatten newlines.
    text = text.replace(f'=={uniSection["title"]}==', '').replace('\n', '. ')
    text = splitSubSection(text)
    # str.replace returns a new string; the original discarded this result,
    # making the pipe-strip a no-op. Note that assigning it also reduces the
    # link markers from [@|)...(|@] to [@)...(@].
    text = text.replace('|', '')
    yield from nltk.sent_tokenize(text)
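# Illustrative (hypothetical) example: with concept 'gravity' and one link
# whose offsets cover 'mass', the text 'gravity pulls mass' is tagged as
# '[@@)gravity(@@] pulls [@|)mass(|@]'; the later '|'-strip then reduces
# the link marker to '[@)mass(@]'.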
def parseUniJSON(uniJSON):
    for uniConcept in uniJSON:
        concept = uniConcept['title']
        for section in uniConcept['sections']:
            for sent in parseUniSection(concept, section):
                # Keep only reasonably long sentences (32+ words).
                if len(sent.split()) < 32:
                    continue
                yield sent
if __name__ == '__main__':
    dirList = os.listdir('Dataset')

    with open('train.csv', 'w', encoding='utf-8') as out:
        out.write('0\n')  # header row: the single text column is named '0'
        for dirName in dirList:
            with open(f'Dataset/{dirName}', 'r', encoding='utf-8') as f:
                # Lower-case the whole dump so concept matching is case-insensitive.
                uniJSON = json.loads(f.read().lower())
            for sent in parseUniJSON(uniJSON):
                # Quote each sentence, backslash-escape embedded quotes,
                # drop asterisks, and leave a blank line between rows.
                out.write(
                    '"'
                    + sent.replace('\n', ' ')
                          .replace('"', '\\"')
                          .replace('*', '')
                    + '"\n\n'
                )
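# Running this (a sketch; assumes a Dataset/ directory of JSON dumps laid
# out as described above):
#   pip install nltk
#   python -c "import nltk; nltk.download('punkt')"
#   python <this_script>.py   # writes train.csv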