|
|
@ -0,0 +1,214 @@ |
|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"<bound method NDFrame.head of 0 1\n", |
|
|
|
"0 ham Go until jurong point, crazy.. Available only ...\n", |
|
|
|
"1 ham Ok lar... Joking wif u oni...\n", |
|
|
|
"2 ham Even my brother is not like to speak with me. ...\n", |
|
|
|
"3 ham As per your request 'Melle Melle (Oru Minnamin...\n", |
|
|
|
"4 spam England v Macedonia - dont miss the goalsteam ...\n", |
|
|
|
"5 spam Congrats! 1 year special cinema pass for 2 is ...\n", |
|
|
|
"6 ham Sorry, I'll call later in meeting.\n", |
|
|
|
"7 ham Tell where you reached\n", |
|
|
|
"8 ham Yes..gauti and sehwag out of odi series.\n", |
|
|
|
"9 spam Please call our customer service representativ...\n", |
|
|
|
"10 ham Havent planning to buy later. I check already ...\n", |
|
|
|
"11 spam Your free ringtone is waiting to be collected....\n", |
|
|
|
"12 ham How would my ip address test that considering ...\n", |
|
|
|
"13 ham I know! Grumpy old people. My mom was like you...\n", |
|
|
|
"14 ham Dont worry. I guess he's busy.\n", |
|
|
|
"15 ham What is the plural of the noun research?\n", |
|
|
|
"16 ham Going for dinner.msg you after.\n", |
|
|
|
"17 ham I'm ok wif it cos i like 2 try new things. But...\n", |
|
|
|
"18 spam GENT! We are trying to contact you. Last weeke...\n", |
|
|
|
"19 ham K..k:)how much does it cost?\n", |
|
|
|
"20 ham I'm home.\n", |
|
|
|
"21 ham Dear, will call Tmorrow.pls accomodate.\n", |
|
|
|
"22 ham First answer my question.\n", |
|
|
|
"23 spam Sunshine Quiz Wkly Q! Win a top Sony DVD playe...\n", |
|
|
|
"24 ham K, text me when you're on the way\n", |
|
|
|
"25 ham Sir, Waiting for your mail.\n", |
|
|
|
"26 spam Customer service annoncement. You have a New Y...\n", |
|
|
|
"27 spam You are a winner U have been specially selecte...\n", |
|
|
|
"28 ham Keep yourself safe for me because I need you a...\n", |
|
|
|
"29 spam URGENT! We are trying to contact you. Last wee...>\n", |
|
|
|
"垃圾邮件个数:9\n", |
|
|
|
"正常邮件个数:21\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"import pandas as pd\n", |
|
|
|
"import numpy as np\n", |
|
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n", |
|
|
|
"from sklearn.linear_model.logistic import LogisticRegression\n", |
|
|
|
"from sklearn.model_selection import train_test_split,cross_val_score\n", |
|
|
|
"df = pd.read_csv('SMSSpamCollection.csv',header=None,encoding='utf-8')\n", |
|
|
|
"print(df.head)\n", |
|
|
|
"print(\"垃圾邮件个数:%s\" % df[df[0]=='spam'][0].count())\n", |
|
|
|
"print(\"正常邮件个数:%s\" % df[df[0]=='ham'][0].count())" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"X = df[1].values\n", |
|
|
|
"y = df[0].values\n", |
|
|
|
"X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y)\n", |
|
|
|
"vectorizer = TfidfVectorizer()\n", |
|
|
|
"X_train = vectorizer.fit_transform(X_train_raw)\n", |
|
|
|
"X_test = vectorizer.transform(X_test_raw)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 3, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"预测为 ham ,信件为 Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. 1.50 SP:Tyrone\n", |
|
|
|
"预测为 ham ,信件为 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n", |
|
|
|
"预测为 ham ,信件为 GENT! We are trying to contact you. Last weekends draw shows that you won a 1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm\n", |
|
|
|
"预测为 ham ,信件为 Customer service annoncement. You have a New Years delivery waiting for you. Please call 07046744435 now to arrange delivery\n", |
|
|
|
"预测为 ham ,信件为 Yes..gauti and sehwag out of odi series.\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"LR = LogisticRegression()\n", |
|
|
|
"LR.fit(X_train,y_train)\n", |
|
|
|
"predictions = LR.predict(X_test)\n", |
|
|
|
"for i,prediction in enumerate(predictions[:5]):\n", |
|
|
|
" print(\"预测为 %s ,信件为 %s\" % (prediction,X_test_raw[i]))" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 4, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
" precision recall f1-score support\n", |
|
|
|
"\n", |
|
|
|
" ham 0.50 1.00 0.67 4\n", |
|
|
|
" spam 0.00 0.00 0.00 4\n", |
|
|
|
"\n", |
|
|
|
"avg / total 0.25 0.50 0.33 8\n", |
|
|
|
"\n", |
|
|
|
"准确率为: [0.8 0.8 0.75 0.75 0.75]\n", |
|
|
|
"平均准确率为: 0.77\n", |
|
|
|
"平均精准率为: 0.0\n", |
|
|
|
"平均召回率为: 0.0\n", |
|
|
|
"平均F1值为: 0.0\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "stderr", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n", |
|
|
|
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n", |
|
|
|
" 'precision', 'predicted', average, warn_for)\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from sklearn.metrics import classification_report\n", |
|
|
|
"print(classification_report(y_test,predictions))\n", |
|
|
|
"\n", |
|
|
|
"from sklearn.metrics import roc_curve,auc\n", |
|
|
|
"# 准确率\n", |
|
|
|
"scores = cross_val_score(LR,X_train,y_train,cv=5)\n", |
|
|
|
"print(\"准确率为: \",scores)\n", |
|
|
|
"print(\"平均准确率为: \",np.mean(scores))\n", |
|
|
|
"\n", |
|
|
|
"# 有时必须要将标签转为数值\n", |
|
|
|
"from sklearn.preprocessing import LabelEncoder\n", |
|
|
|
"class_le = LabelEncoder()\n", |
|
|
|
"y_train_n = class_le.fit_transform(y_train)\n", |
|
|
|
"y_test_n = class_le.fit_transform(y_test)\n", |
|
|
|
"\n", |
|
|
|
"# 精准率\n", |
|
|
|
"precision = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='precision')\n", |
|
|
|
"print(\"平均精准率为: \",np.mean(precision))\n", |
|
|
|
"# 召回率\n", |
|
|
|
"recall = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='recall')\n", |
|
|
|
"print(\"平均召回率为: \",np.mean(recall)) \n", |
|
|
|
"# F1值\n", |
|
|
|
"f1 = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='f1')\n", |
|
|
|
"print(\"平均F1值为: \",np.mean(f1)) " |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"#### 警报通常是由于系统对某种标签没有找到。尝试将数据集中加入更多的数据,再次运行,查看结果。" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python 3", |
|
|
|
"language": "python", |
|
|
|
"name": "python3" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.6.5" |
|
|
|
} |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 2 |
|
|
|
} |