Final project for the DASE Introduction course: an analysis of COVID-19
 
 

96 lines
3.6 KiB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
# Open the data file.
# A raw string is used so the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
dataset = pd.read_csv(r'E:\DaseIntro\COVID-19Analysis\COVID-19\covid-19-all.csv')
# Data preprocessing
def parse_ymd(s):
    """Normalize a ``year-month-day`` string to zero-padded ``YYYY-MM-DD``.

    Args:
        s: Date text made of exactly three ``-``-separated integer fields
            (year, month, day), e.g. ``"2020-1-5"``.

    Returns:
        The same calendar date formatted with ``"%Y-%m-%d"``.

    Raises:
        ValueError: If ``s`` does not split into exactly three fields, a
            field is not an integer, or the fields do not form a valid date.
    """
    # Imported here because the file's top-level imports do not include it.
    import datetime

    year_s, mon_s, day_s = s.split('-')
    parsed = datetime.datetime(int(year_s), int(mon_s), int(day_s))
    return parsed.strftime("%Y-%m-%d")
# Replace missing values with 0 and parse the date column.
dataset = dataset.fillna(0)
dataset['Date'] = pd.to_datetime(dataset['Date'])

# Sum the three counters over all rows sharing a (country, date) pair.
_kept_columns = ['Country/Region', 'Confirmed', 'Recovered', 'Deaths', 'Date']
_per_country_day = dataset[_kept_columns].groupby(['Country/Region', 'Date']).sum()
dataset = _per_country_day.reset_index()

# Keep only the rows for China and label them with a fixed daily calendar.
# NOTE(review): the index is a hard-coded date range, not the 'Date' column;
# the assignment fails if the number of China rows differs from the number
# of days in that range.
_is_china = dataset['Country/Region'] == 'China'
CN = dataset[_is_china]
CN.index = pd.Index(pd.date_range('2020-01-22', '2020-12-09', freq='1D'))
# Visualize the series after first-order differencing.
# Each frame keeps its original column name and holds value[t] - value[t-1];
# the first row is NaN because it has no predecessor.
_confirmed_filled = CN[['Confirmed']].fillna(0)
confirmedSeries = _confirmed_filled - _confirmed_filled.shift(1)

_recovered_filled = CN[['Recovered']].fillna(0)
recoveredSeries = _recovered_filled - _recovered_filled.shift(1)

_deaths_filled = CN[['Deaths']].fillna(0)
deathsSeries = _deaths_filled - _deaths_filled.shift(1)

# One panel per series, placed in the first three cells of a 2x2 grid.
fig1 = plt.figure()
axcon = fig1.add_subplot(2, 2, 1)
axrec = fig1.add_subplot(2, 2, 2)
axdea = fig1.add_subplot(2, 2, 3)
for _panel, _title, _diffed in (
    (axcon, "Confirmed", confirmedSeries),
    (axrec, "Recovered", recoveredSeries),
    (axdea, "Deaths", deathsSeries),
):
    _panel.set_title(_title)
    _panel.plot(_diffed)
plt.show()
# ADF (augmented Dickey-Fuller) test on each differenced series.
# The first row is skipped because differencing leaves it as NaN.
for _diffed in (confirmedSeries, recoveredSeries, deathsSeries):
    print(adfuller(_diffed.iloc[1:]))
# ARIMA models, fitted on the differenced series (first NaN row dropped).
# The middle term of each order is 0 because the differencing was already
# done by hand above.
#
# `order` is passed by keyword: in current statsmodels the second positional
# parameter of ARIMA is `exog`, so a positional tuple would be misread as
# exogenous data. The keyword form is accepted by both old and new APIs.
#
# Each prediction spans 2020-01-23 .. 2020-12-31, i.e. the fitted days plus
# a forecast past the end of the data (the index assigned to CN stops at
# 2020-12-09).
modelConfirmed = sm.tsa.ARIMA(confirmedSeries.iloc[1:], order=(1, 0, 2)).fit()
confirmed_pre = modelConfirmed.predict(start="2020-01-23", end="2020-12-31", dynamic=False)

modelRecovered = sm.tsa.ARIMA(recoveredSeries.iloc[1:], order=(1, 0, 2)).fit()
recovered_pre = modelRecovered.predict(start="2020-01-23", end="2020-12-31", dynamic=False)

modelDeaths = sm.tsa.ARIMA(deathsSeries.iloc[1:], order=(1, 0, 1)).fit()
deaths_pre = modelDeaths.predict(start="2020-01-23", end="2020-12-31", dynamic=False)
# Undo the differencing: turn predicted daily changes back into cumulative totals.
def _restore_cumulative(diff_pred, observed, column):
    """Rebuild cumulative totals from predicted day-over-day changes.

    The predictions start one day after the observations, so position ``i``
    of ``diff_pred`` is the change for the day following ``observed[i]``.
    While an observation is available the total is
    ``observed[i] + diff_pred[i]``; past the last observation each forecast
    change is added to the previous predicted total.

    Args:
        diff_pred: Predicted changes (Series or array-like), one per day.
        observed: 1-D array of observed cumulative totals.
        column: Name of the single column in the returned frame.

    Returns:
        A one-column DataFrame indexed by day, 2020-01-23 .. 2020-12-31.

    Raises:
        ValueError: If there are more observations than predictions, or the
            number of predictions does not match the fixed date index.
    """
    # Work on a positional float copy; integer keys on a date-indexed Series
    # depend on a deprecated positional fallback in pandas.
    totals = np.asarray(diff_pred, dtype=float).copy()
    n_obs = len(observed)
    if n_obs > len(totals):
        raise ValueError("more observations than predicted values")

    totals[:n_obs] += observed
    # From the last observation-anchored value onward, accumulate forecasts.
    first_chained = max(n_obs - 1, 0)
    totals[first_chained:] = np.cumsum(totals[first_chained:])

    forecast_days = pd.Index(pd.date_range('2020-01-23', '2020-12-31', freq='1D'))
    return pd.DataFrame({column: totals}, index=forecast_days)


confirmed_pre = _restore_cumulative(confirmed_pre, np.array(CN['Confirmed']), 'confirmed_pre')
# The recovered series is bounded by its own length here; the original loop
# used the length of confirmed_pre by mistake.
recovered_pre = _restore_cumulative(recovered_pre, np.array(CN['Recovered']), 'recovered_pre')
deaths_pre = _restore_cumulative(deaths_pre, np.array(CN['Deaths']), 'deaths_pre')
# Visualization: observed cumulative totals against the ARIMA reconstruction,
# one stacked panel per series.
fig2 = plt.figure()
axcon = fig2.add_subplot(311)
axrec = fig2.add_subplot(312)
axdea = fig2.add_subplot(313)
for _panel, _observed, _label, _fitted in (
    (axcon, CN['Confirmed'], "confirmed", confirmed_pre),
    (axrec, CN['Recovered'], "recovered", recovered_pre),
    (axdea, CN['Deaths'], "deaths", deaths_pre),
):
    _panel.plot(_observed, label=_label)
    _panel.plot(_fitted, label="ARIMA")
    # plt.legend() only affects the current (last created) axes, so a legend
    # is requested on every panel to show the labels set above.
    _panel.legend()
plt.show()