import re

import jieba
import pandas as pd
import torch
import torch.nn as nn


class LSTMNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM followed by a fully connected layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layer_dim,
                            batch_first=True)
        self.fcl = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embeds = self.embedding(x)
        r_out, (h_n, h_c) = self.lstm(embeds, None)
        # Classify from the hidden state of the last time step
        out = self.fcl(r_out[:, -1, :])
        return out


def Chinese_pre(text_data, stopwords):
    # Lowercase the letters and strip out digits
    text_data = text_data.lower()
    text_data = re.sub(r"\d+", "", text_data)
    # Segment the text with jieba in accurate (non-full) mode
    text_data = list(jieba.cut(text_data, cut_all=False))
    # Drop stopwords and surrounding whitespace
    text_data = [word.strip() for word in text_data if word not in stopwords]
    # Join the remaining tokens into a space-separated string
    text_data = " ".join(text_data)
    return text_data


def TexttoLable(textdata):
    # Convert the input text to a tensor; first, segment the text
    import nltk
    from nltk.corpus import stopwords
    nltk.download('stopwords')
    words = stopwords.words('english')
    # Merge NLTK's English stopwords with the custom list in stop.txt
    stop_words = set()
    with open("stop.txt", encoding="utf-8") as infile:
        for line in infile:
            line = line.rstrip('\n')
            if line:
                stop_words.add(line.lower())
    for i in words:
        stop_words.add(i)
    textdata = Chinese_pre(textdata, stop_words)

    # Repeat the single sentence 128 times so it fills one full batch
    data1 = []
    for i in range(128):
        data1.append(textdata)
    df = pd.DataFrame({'cutword': data1})

    # to_csv also writes the row index; positionally that column becomes
    # the dummy "labelcode" field that text_data_fields expects below
    df.to_csv("tmp.csv")

    mytokenize = lambda x: x.split()
    from torchtext.legacy import data
    TEXT = data.Field(sequential=True, tokenize=mytokenize,
                      include_lengths=True, use_vocab=True,
                      batch_first=True, fix_length=40)
    LABEL = data.Field(sequential=False, use_vocab=False,
                       pad_token=None, unk_token=None)
    # Fields applied, in column order, to every dataset that is read
    text_data_fields = [
        ("labelcode", LABEL),
        ("cutword", TEXT)
    ]
    # Read the data (the same CSV doubles as train/validation/test here)
    traindata, valdata, testdata = data.TabularDataset.splits(
        path="./", format="csv", train="tmp.csv", fields=text_data_fields,
        validation="tmp.csv",
        test="tmp.csv", skip_header=True
    )

    TEXT.build_vocab(traindata, max_size=100, vectors=None)

    # Define an iterator that batches examples of similar length together
    BATCH_SIZE = 128
    test_iter = data.BucketIterator(testdata, batch_size=BATCH_SIZE)

    vocab_size = len(TEXT.vocab)
    embedding_dim = 50
    hidden_dim = 256
    layer_dim = 1
    output_dim = 4
    # Instantiated only to document the architecture; the trained network
    # is loaded below as a whole pickled model
    lstmmodel = LSTMNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim)

    res = 0
    model = torch.load("model.pkl")
    model.eval()  # inference mode
    for step, batch in enumerate(test_iter):
        # include_lengths=True makes batch.cutword an (ids, lengths) pair
        textfinal = batch.cutword[0]
        out = model(textfinal)
        pre_lab = torch.argmax(out, 1)
        res = pre_lab[0]
    print(res.numpy())


# Sample input: a WeChat-style notification snippet, roughly
# "Luobo YunFu chat group and 3 other conversations"
TexttoLable("萝卜云服交流群等3个会话 ")
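
# ---------------------------------------------------------------------------
# Sketch (not part of the original script): once a vocabulary and a trained
# model exist, a single sentence can be scored directly instead of writing a
# 128-row CSV and re-reading it through torchtext. The helper below is
# hypothetical; `vocab` is assumed to be the TEXT.vocab built inside
# TexttoLable(), `model` the network loaded from model.pkl, and fix_length
# the same 40 used there.
def predict_one(model, vocab, stop_words, sentence, fix_length=40):
    tokens = Chinese_pre(sentence, stop_words).split()
    # Numericalize, then truncate/pad to fix_length with the <pad> index
    ids = [vocab.stoi[tok] for tok in tokens][:fix_length]
    ids += [vocab.stoi['<pad>']] * (fix_length - len(ids))
    x = torch.tensor([ids])  # shape: (1, fix_length)
    with torch.no_grad():
        return torch.argmax(model(x), dim=1).item()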