51194507011_stu_ecnu_edu_cn
/
Intel_AI


								{

								 "cells": [

								  {

								   "cell_type": "code",

								   "execution_count": 1,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "<bound method NDFrame.head of        0                                                  1\n",

								      "0    ham  Go until jurong point, crazy.. Available only ...\n",

								      "1    ham                      Ok lar... Joking wif u oni...\n",

								      "2    ham  Even my brother is not like to speak with me. ...\n",

								      "3    ham  As per your request 'Melle Melle (Oru Minnamin...\n",

								      "4   spam  England v Macedonia - dont miss the goalsteam ...\n",

								      "5   spam  Congrats! 1 year special cinema pass for 2 is ...\n",

								      "6    ham                 Sorry, I'll call later in meeting.\n",

								      "7    ham                             Tell where you reached\n",

								      "8    ham           Yes..gauti and sehwag out of odi series.\n",

								      "9   spam  Please call our customer service representativ...\n",

								      "10   ham  Havent planning to buy later. I check already ...\n",

								      "11  spam  Your free ringtone is waiting to be collected....\n",

								      "12   ham  How would my ip address test that considering ...\n",

								      "13   ham  I know! Grumpy old people. My mom was like you...\n",

								      "14   ham                     Dont worry. I guess he's busy.\n",

								      "15   ham           What is the plural of the noun research?\n",

								      "16   ham                    Going for dinner.msg you after.\n",

								      "17   ham  I'm ok wif it cos i like 2 try new things. But...\n",

								      "18  spam  GENT! We are trying to contact you. Last weeke...\n",

								      "19   ham                       K..k:)how much does it cost?\n",

								      "20   ham                                          I'm home.\n",

								      "21   ham            Dear, will call Tmorrow.pls accomodate.\n",

								      "22   ham                          First answer my question.\n",

								      "23  spam  Sunshine Quiz Wkly Q! Win a top Sony DVD playe...\n",

								      "24   ham                  K, text me when you're on the way\n",

								      "25   ham                        Sir, Waiting for your mail.\n",

								      "26  spam  Customer service annoncement. You have a New Y...\n",

								      "27  spam  You are a winner U have been specially selecte...\n",

								      "28   ham  Keep yourself safe for me because I need you a...\n",

								      "29  spam  URGENT! We are trying to contact you. Last wee...>\n",

								      "垃圾邮件个数：9\n",

								      "正常邮件个数：21\n"

								     ]

								    }

								   ],

								   "source": [

								    "import pandas as pd\n",

								    "import numpy as np\n",

								    "from sklearn.feature_extraction.text import TfidfVectorizer\n",

								    "# Use the public import path; the private sklearn.linear_model.logistic submodule was deprecated and later removed.\n",

								    "from sklearn.linear_model import LogisticRegression\n",

								    "from sklearn.model_selection import train_test_split,cross_val_score\n",

								    "\n",

								    "# The CSV has no header row: column 0 holds the ham/spam label, column 1 the message text.\n",

								    "df = pd.read_csv('SMSSpamCollection.csv',header=None,encoding='utf-8')\n",

								    "# head must be called; printing the bare attribute only shows the bound-method repr.\n",

								    "print(df.head())\n",

								    "# Count the messages carrying each label.\n",

								    "print(\"垃圾邮件个数：%s\" % df[df[0]=='spam'][0].count())\n",

								    "print(\"正常邮件个数：%s\" % df[df[0]=='ham'][0].count())"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 2,

								   "metadata": {},

								   "outputs": [],

								   "source": [

								    "# Features are the raw message texts (column 1); targets are the ham/spam labels (column 0).\n",

								    "X = df[1].values\n",

								    "y = df[0].values\n",

								    "# Seed the split so it is reproducible, and stratify on y so train and test keep the ham/spam ratio.\n",

								    "X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,random_state=42,stratify=y)\n",

								    "# Learn the TF-IDF vocabulary from the training texts only, then reuse it for the test texts.\n",

								    "vectorizer = TfidfVectorizer()\n",

								    "X_train = vectorizer.fit_transform(X_train_raw)\n",

								    "X_test = vectorizer.transform(X_test_raw)"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 3,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "预测为 ham ,信件为 Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. 1.50 SP:Tyrone\n",

								      "预测为 ham ,信件为 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n",

								      "预测为 ham ,信件为 GENT! We are trying to contact you. Last weekends draw shows that you won a 1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm\n",

								      "预测为 ham ,信件为 Customer service annoncement. You have a New Years delivery waiting for you. Please call 07046744435 now to arrange delivery\n",

								      "预测为 ham ,信件为 Yes..gauti and sehwag out of odi series.\n"

								     ]

								    }

								   ],

								   "source": [

								    "# Train a logistic-regression classifier on the TF-IDF training matrix (fit returns the estimator).\n",

								    "LR = LogisticRegression().fit(X_train,y_train)\n",

								    "# Label every test message, then show the first five predictions next to their raw text.\n",

								    "predictions = LR.predict(X_test)\n",

								    "for predicted_label,message in zip(predictions[:5],X_test_raw):\n",

								    "    print(\"预测为 %s ,信件为 %s\" % (predicted_label,message))"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 4,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "             precision    recall  f1-score   support\n",

								      "\n",

								      "        ham       0.50      1.00      0.67         4\n",

								      "       spam       0.00      0.00      0.00         4\n",

								      "\n",

								      "avg / total       0.25      0.50      0.33         8\n",

								      "\n",

								      "准确率为:  [0.8  0.8  0.75 0.75 0.75]\n",

								      "平均准确率为:  0.77\n",

								      "平均精准率为:  0.0\n",

								      "平均召回率为:  0.0\n",

								      "平均F1值为:  0.0\n"

								     ]

								    },

								    {

								     "name": "stderr",

								     "output_type": "stream",

								     "text": [

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n",

								      "C:\\Users\\52257\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",

								      "  'precision', 'predicted', average, warn_for)\n"

								     ]

								    }

								   ],

								   "source": [

								    "from sklearn.metrics import classification_report\n",

								    "# Per-class precision, recall and F1 on the held-out test split.\n",

								    "print(classification_report(y_test,predictions))\n",

								    "\n",

								    "from sklearn.metrics import roc_curve,auc\n",

								    "# Accuracy: 5-fold cross-validation on the training split using the estimator's default scorer.\n",

								    "scores =  cross_val_score(LR,X_train,y_train,cv=5)\n",

								    "print(\"准确率为: \",scores)\n",

								    "print(\"平均准确率为: \",np.mean(scores))\n",

								    "\n",

								    "# Some scorers need numeric labels, so encode the string labels as integers.\n",

								    "from sklearn.preprocessing import LabelEncoder\n",

								    "class_le = LabelEncoder()\n",

								    "# LabelEncoder assigns codes in sorted label order, so ham -> 0 and spam -> 1;\n",

								    "# the binary scorers below therefore treat spam as the positive class.\n",

								    "y_train_n = class_le.fit_transform(y_train)\n",

								    "# Reuse the mapping fitted on the training labels; refitting on the test labels could assign different codes.\n",

								    "y_test_n = class_le.transform(y_test)\n",

								    "\n",

								    "# Precision\n",

								    "precision =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='precision')\n",

								    "print(\"平均精准率为: \",np.mean(precision))\n",

								    "# Recall\n",

								    "recall =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='recall')\n",

								    "print(\"平均召回率为: \",np.mean(recall))\n",

								    "# F1 score\n",

								    "f1 =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='f1')\n",

								    "print(\"平均F1值为: \",np.mean(f1))"

								   ]

								  },

								  {

								   "cell_type": "markdown",

								   "metadata": {},

								   "source": [

								    "#### 上面的警告（UndefinedMetricWarning）是因为模型没有把任何样本预测为某个标签（此处为 spam），导致精准率和 F1 值无法计算而被置为 0。尝试向数据集中加入更多数据后重新运行，查看结果。"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": null,

								   "metadata": {},

								   "outputs": [],

								   "source": []

								  }

								 ],

								 "metadata": {

								  "kernelspec": {

								   "display_name": "Python 3",

								   "language": "python",

								   "name": "python3"

								  },

								  "language_info": {

								   "codemirror_mode": {

								    "name": "ipython",

								    "version": 3

								   },

								   "file_extension": ".py",

								   "mimetype": "text/x-python",

								   "name": "python",

								   "nbconvert_exporter": "python",

								   "pygments_lexer": "ipython3",

								   "version": "3.6.5"

								  }

								 },

								 "nbformat": 4,

								 "nbformat_minor": 2

								}