|
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<bound method NDFrame.head of 0 1\n",
|
|
"0 ham Go until jurong point, crazy.. Available only ...\n",
|
|
"1 ham Ok lar... Joking wif u oni...\n",
|
|
"2 ham Even my brother is not like to speak with me. ...\n",
|
|
"3 ham As per your request 'Melle Melle (Oru Minnamin...\n",
|
|
"4 spam England v Macedonia - dont miss the goalsteam ...\n",
|
|
"5 spam Congrats! 1 year special cinema pass for 2 is ...\n",
|
|
"6 ham Sorry, I'll call later in meeting.\n",
|
|
"7 ham Tell where you reached\n",
|
|
"8 ham Yes..gauti and sehwag out of odi series.\n",
|
|
"9 spam Please call our customer service representativ...\n",
|
|
"10 ham Havent planning to buy later. I check already ...\n",
|
|
"11 spam Your free ringtone is waiting to be collected....\n",
|
|
"12 ham How would my ip address test that considering ...\n",
|
|
"13 ham I know! Grumpy old people. My mom was like you...\n",
|
|
"14 ham Dont worry. I guess he's busy.\n",
|
|
"15 ham What is the plural of the noun research?\n",
|
|
"16 ham Going for dinner.msg you after.\n",
|
|
"17 ham I'm ok wif it cos i like 2 try new things. But...\n",
|
|
"18 spam GENT! We are trying to contact you. Last weeke...\n",
|
|
"19 ham K..k:)how much does it cost?\n",
|
|
"20 ham I'm home.\n",
|
|
"21 ham Dear, will call Tmorrow.pls accomodate.\n",
|
|
"22 ham First answer my question.\n",
|
|
"23 spam Sunshine Quiz Wkly Q! Win a top Sony DVD playe...\n",
|
|
"24 ham K, text me when you're on the way\n",
|
|
"25 ham Sir, Waiting for your mail.\n",
|
|
"26 spam Customer service annoncement. You have a New Y...\n",
|
|
"27 spam You are a winner U have been specially selecte...\n",
|
|
"28 ham Keep yourself safe for me because I need you a...\n",
|
|
"29 spam URGENT! We are trying to contact you. Last wee...>\n",
|
|
"垃圾邮件个数:9\n",
|
|
"正常邮件个数:21\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
"# Import from the public package path; the private sklearn.linear_model.logistic module is not a stable import location.\n",
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|
"from sklearn.model_selection import train_test_split,cross_val_score\n",
|
|
"\n",
|
|
"# Load the SMS data. The file is read without a header row, so the columns are named 0 (label) and 1 (message text).\n",
|
|
"df = pd.read_csv('SMSSpamCollection.csv',header=None,encoding='utf-8')\n",
|
|
"# head must be called: printing the bare attribute shows the bound-method repr, which drags in the whole frame.\n",
|
|
"print(df.head())\n",
|
|
"# Count how many messages carry each label.\n",
|
|
"print(\"垃圾邮件个数:%s\" % df[df[0]=='spam'][0].count())\n",
|
|
"print(\"正常邮件个数:%s\" % df[df[0]=='ham'][0].count())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Column 1 holds the message text (features); column 0 holds the ham/spam label (target).\n",
|
|
"X = df[1].values\n",
|
|
"y = df[0].values\n",
|
|
"# Fix the seed so the split, and every metric computed from it, is the same on each run.\n",
|
|
"X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,random_state=42)\n",
|
|
"# Learn the TF-IDF vocabulary from the training messages only, then reuse it for the test messages.\n",
|
|
"vectorizer = TfidfVectorizer()\n",
|
|
"X_train = vectorizer.fit_transform(X_train_raw)\n",
|
|
"X_test = vectorizer.transform(X_test_raw)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"预测为 ham ,信件为 Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. 1.50 SP:Tyrone\n",
|
|
"预测为 ham ,信件为 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n",
|
|
"预测为 ham ,信件为 GENT! We are trying to contact you. Last weekends draw shows that you won a 1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm\n",
|
|
"预测为 ham ,信件为 Customer service annoncement. You have a New Years delivery waiting for you. Please call 07046744435 now to arrange delivery\n",
|
|
"预测为 ham ,信件为 Yes..gauti and sehwag out of odi series.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Train a logistic-regression classifier on the TF-IDF features, then label the test messages.\n",
|
|
"LR = LogisticRegression().fit(X_train, y_train)\n",
|
|
"predictions = LR.predict(X_test)\n",
|
|
"# Show the first five predicted labels next to the raw message each one belongs to.\n",
|
|
"for label, message in zip(predictions[:5], X_test_raw):\n",
|
|
"    print(\"预测为 %s ,信件为 %s\" % (label, message))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" ham 0.50 1.00 0.67 4\n",
|
|
" spam 0.00 0.00 0.00 4\n",
|
|
"\n",
|
|
"avg / total 0.25 0.50 0.33 8\n",
|
|
"\n",
|
|
"准确率为: [0.8 0.8 0.75 0.75 0.75]\n",
|
|
"平均准确率为: 0.77\n",
|
|
"平均精准率为: 0.0\n",
|
|
"平均召回率为: 0.0\n",
|
|
"平均F1值为: 0.0\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n",
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
|
|
" 'precision', 'predicted', average, warn_for)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.metrics import classification_report\n",
|
|
"# Per-class precision / recall / F1 of the test-set predictions.\n",
|
|
"print(classification_report(y_test,predictions))\n",
|
|
"\n",
|
|
"from sklearn.metrics import roc_curve,auc\n",
|
|
"# Accuracy, estimated with 5-fold cross-validation on the training set\n",
|
|
"scores = cross_val_score(LR,X_train,y_train,cv=5)\n",
|
|
"print(\"准确率为: \",scores)\n",
|
|
"print(\"平均准确率为: \",np.mean(scores))\n",
|
|
"\n",
|
|
"# Labels sometimes have to be numeric; the precision/recall/f1 runs below are given the encoded labels.\n",
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
"class_le = LabelEncoder()\n",
|
|
"y_train_n = class_le.fit_transform(y_train)\n",
|
|
"# Reuse the mapping learned from the training labels; refitting on the test labels could assign different integers.\n",
|
|
"y_test_n = class_le.transform(y_test)\n",
|
|
"\n",
|
|
"# Precision\n",
|
|
"precision = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='precision')\n",
|
|
"print(\"平均精准率为: \",np.mean(precision))\n",
|
|
"# Recall\n",
|
|
"recall = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='recall')\n",
|
|
"print(\"平均召回率为: \",np.mean(recall))\n",
|
|
"# F1 score\n",
|
|
"f1 = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='f1')\n",
|
|
"print(\"平均F1值为: \",np.mean(f1))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### 出现警告通常是因为模型对某个标签没有给出任何预测(例如从未预测出 spam),此时该标签的精准率和 F1 值无法定义,会被置为 0。尝试向数据集中加入更多的数据,再次运行,查看结果。"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|