You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

214 lines
9.9 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of 0 1\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 ham Even my brother is not like to speak with me. ...\n",
"3 ham As per your request 'Melle Melle (Oru Minnamin...\n",
"4 spam England v Macedonia - dont miss the goalsteam ...\n",
"5 spam Congrats! 1 year special cinema pass for 2 is ...\n",
"6 ham Sorry, I'll call later in meeting.\n",
"7 ham Tell where you reached\n",
"8 ham Yes..gauti and sehwag out of odi series.\n",
"9 spam Please call our customer service representativ...\n",
"10 ham Havent planning to buy later. I check already ...\n",
"11 spam Your free ringtone is waiting to be collected....\n",
"12 ham How would my ip address test that considering ...\n",
"13 ham I know! Grumpy old people. My mom was like you...\n",
"14 ham Dont worry. I guess he's busy.\n",
"15 ham What is the plural of the noun research?\n",
"16 ham Going for dinner.msg you after.\n",
"17 ham I'm ok wif it cos i like 2 try new things. But...\n",
"18 spam GENT! We are trying to contact you. Last weeke...\n",
"19 ham K..k:)how much does it cost?\n",
"20 ham I'm home.\n",
"21 ham Dear, will call Tmorrow.pls accomodate.\n",
"22 ham First answer my question.\n",
"23 spam Sunshine Quiz Wkly Q! Win a top Sony DVD playe...\n",
"24 ham K, text me when you're on the way\n",
"25 ham Sir, Waiting for your mail.\n",
"26 spam Customer service annoncement. You have a New Y...\n",
"27 spam You are a winner U have been specially selecte...\n",
"28 ham Keep yourself safe for me because I need you a...\n",
"29 spam URGENT! We are trying to contact you. Last wee...>\n",
"垃圾邮件个数:9\n",
"正常邮件个数:21\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"# Import from the public package path: the private `sklearn.linear_model.logistic`\n",
"# module was deprecated in scikit-learn 0.22 and removed in 0.24.\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split,cross_val_score\n",
"\n",
"# The CSV is read without a header row, so the columns get the integer names 0 and 1;\n",
"# column 0 is compared against 'spam'/'ham' below, i.e. it holds the label.\n",
"df = pd.read_csv('SMSSpamCollection.csv',header=None,encoding='utf-8')\n",
"\n",
"# Call head() -- printing the bare `df.head` attribute only shows the bound-method repr.\n",
"print(df.head())\n",
"\n",
"# Number of rows per class.\n",
"print(\"垃圾邮件个数:%s\" % df[df[0]=='spam'][0].count())\n",
"print(\"正常邮件个数:%s\" % df[df[0]=='ham'][0].count())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Column 1 is used as the model input and column 0 as the target.\n",
"X = df[1].values\n",
"y = df[0].values\n",
"\n",
"# Fix the seed so the split (and every score computed from it) is reproducible on\n",
"# Restart & Run All, and stratify so train and test keep the same class ratio.\n",
"X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,random_state=42,stratify=y)\n",
"\n",
"# Fit the TF-IDF vocabulary on the training texts only, then reuse that fitted\n",
"# vectorizer to transform the test texts.\n",
"vectorizer = TfidfVectorizer()\n",
"X_train = vectorizer.fit_transform(X_train_raw)\n",
"X_test = vectorizer.transform(X_test_raw)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"预测为 ham ,信件为 Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. 1.50 SP:Tyrone\n",
"预测为 ham ,信件为 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n",
"预测为 ham ,信件为 GENT! We are trying to contact you. Last weekends draw shows that you won a 1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm\n",
"预测为 ham ,信件为 Customer service annoncement. You have a New Years delivery waiting for you. Please call 07046744435 now to arrange delivery\n",
"预测为 ham ,信件为 Yes..gauti and sehwag out of odi series.\n"
]
}
],
"source": [
"# Train a logistic-regression classifier on the TF-IDF features, then label the test set.\n",
"LR = LogisticRegression().fit(X_train,y_train)\n",
"predictions = LR.predict(X_test)\n",
"\n",
"# Print the first five predictions next to the raw text each one was made for.\n",
"for predicted_label,message in zip(predictions[:5],X_test_raw):\n",
"    print(\"预测为 %s ,信件为 %s\" % (predicted_label,message))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" ham 0.50 1.00 0.67 4\n",
" spam 0.00 0.00 0.00 4\n",
"\n",
"avg / total 0.25 0.50 0.33 8\n",
"\n",
"准确率为: [0.8 0.8 0.75 0.75 0.75]\n",
"平均准确率为: 0.77\n",
"平均精准率为: 0.0\n",
"平均召回率为: 0.0\n",
"平均F1值为: 0.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"# Per-class precision / recall / F1 of the predictions on the held-out test set.\n",
"print(classification_report(y_test,predictions))\n",
"\n",
"from sklearn.metrics import roc_curve,auc\n",
"# Accuracy: 5-fold cross-validation on the training set with the estimator's default scorer.\n",
"scores = cross_val_score(LR,X_train,y_train,cv=5)\n",
"print(\"准确率为: \",scores)\n",
"print(\"平均准确率为: \",np.mean(scores))\n",
"\n",
"# Sometimes the labels have to be converted to numbers: the binary 'precision' /\n",
"# 'recall' / 'f1' scorers treat label 1 as the positive class by default.\n",
"# LabelEncoder numbers the classes in sorted order, so 'ham' -> 0 and 'spam' -> 1.\n",
"from sklearn.preprocessing import LabelEncoder\n",
"class_le = LabelEncoder()\n",
"y_train_n = class_le.fit_transform(y_train)\n",
"# Reuse the mapping learned from the training labels. Re-fitting on y_test could\n",
"# assign different integers (e.g. if the test split contained only one class).\n",
"y_test_n = class_le.transform(y_test)\n",
"\n",
"# Precision\n",
"precision = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='precision')\n",
"print(\"平均精准率为: \",np.mean(precision))\n",
"# Recall\n",
"recall = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='recall')\n",
"print(\"平均召回率为: \",np.mean(recall))\n",
"# F1 score\n",
"f1 = cross_val_score(LR,X_train,y_train_n,cv=5,scoring='f1')\n",
"print(\"平均F1值为: \",np.mean(f1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 上面的 UndefinedMetricWarning 警告是因为模型没有把任何样本预测为 spam(正类),精准率和 F1 值无法定义,因此被置为 0.0。这里的数据集很小(只有 30 条),可以尝试在数据集中加入更多的数据,再次运行,查看结果。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}