빅데이터 | 머신러닝 | 딥러닝/빅데이터 분석
[pandas] 코로나 수치 예측하기 (feat. Linear Regression)
냠냠:)
2020. 5. 3. 02:03
- isna() : 사용할 수 없는 값 (null)
- notna() : 사용할 수 있는 값 (not null)
- astype() : 데이터 타입 변환 (ex Int64)
- loc[] : 행/열 인덱스 접근
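- np.histogram : 도수 분포표를 만들어 주는 함수. bins는 나눌 구간 수이며, 구간별 값(counts)과 구간 경계(bin_edges)를 반환한다.
기본값(density=False)에서 counts는 각 구간에 속한 데이터 개수이고, density=True이면 구간 폭에 대해 적분했을 때 1이 되도록 정규화한 확률 밀도 값이다.
- reshape(-1,1) : 배열을 열이 1개인 2차원 배열로 바꾸는 함수. -1은 열을 1개로 두었을 때 필요한 행 수를 자동으로 계산하라는 뜻이다.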
In [1]:
#importing the required packages and csv file
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import requests
from io import StringIO
# Silence all library warnings so the notebook output stays readable.
# NOTE(review): this also hides pandas' chained-assignment warnings.
warnings.filterwarnings('ignore')

# Turn the Google Drive "view" link into a direct-download link: the file id
# is the second-to-last path segment of the sharing URL.
orig_url='https://drive.google.com/file/d/1zvvT5vp9xiKJDXacnwvP0zQ1y1G1C6tz/view?usp=sharing'
file_id = orig_url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?export=download&id='+file_id

# Fail fast on a hung connection or an HTTP error instead of silently
# handing an error page to read_csv.
response = requests.get(dwn_url, timeout=30)
response.raise_for_status()
url = response.text  # name kept from the original; holds the CSV text, not a URL
csv_raw = StringIO(url)
patient = pd.read_csv(csv_raw)
patient.head(5)
Out[1]:
In [2]:
# Print a summary of the patient table (column dtypes and non-null counts).
patient.info()
In [3]:
patient['birth_year'].isna().sum()  # number of rows whose birth_year is missing
Out[3]:
In [4]:
# Missing and non-missing birth years counted separately, then added up.
n_missing = patient['birth_year'].isna().sum()
n_present = patient['birth_year'].notna().sum()
n_present + n_missing
Out[4]:
In [5]:
# Preview birth_year cast to the 'Int64' dtype; the result is not assigned,
# so the column stored in `patient` is left unchanged.
patient['birth_year'].astype('Int64')
Out[5]:
In [6]:
# Data pre-processing: derive an age column as 2020 minus birth_year.
reference_year = 2020
patient['age'] = reference_year - patient['birth_year']
In [7]:
# Split the patients by the value of their `state` column. Each subset is an
# explicit copy so that columns added to it later are written to an
# independent frame rather than to a slice of `patient`.
deceased = patient.loc[patient['state']=='deceased'].copy()
released = patient.loc[patient['state']=='released'].copy()
isolated = patient.loc[patient['state']=='isolated'].copy()
In [8]:
# Preview confirmed_date parsed as datetimes; the result is not stored.
pd.to_datetime(deceased['confirmed_date'])
Out[8]:
In [9]:
# Adding one more feature
# Cast both date columns to datetime so they can be subtracted.
date_column = ['confirmed_date','deceased_date']
for i in date_column:
    deceased[i] = pd.to_datetime(deceased[i])
# Time elapsed between the confirmed date and the deceased date. This must
# run after the loop, once both columns are datetimes.
deceased['no_of_days_survived']= deceased['deceased_date'] - deceased['confirmed_date']
deceased.head(5)
Out[9]:
In [10]:
# Cast both date columns to datetime so they can be subtracted.
date_column = ['confirmed_date','released_date']
for i in date_column:
    released[i] = pd.to_datetime(released[i])
# Time elapsed between the confirmed date and the released date. This must
# run after the loop, once both columns are datetimes.
released['no_of_days_treated'] = released['released_date'] - released['confirmed_date']
released.head(5)
Out[10]:
In [11]:
# Report each outcome group's size as a percentage of all patients.
total_patients = len(patient)
for message, group in (
    ('The percentage of released patient is :', released),
    ('The percentage of deceased patient is :', deceased),
    ('The percentage of isolated patient is :', isolated),
):
    print(message, (len(group) * 100) / total_patients)
In [12]:
def _plot_age_distribution(ages, label):
    """Plot the binned PDF and CDF of *ages* on the current figure.

    Missing ages are filtered out first, because np.histogram cannot
    auto-detect a finite bin range when the data contain NaN. Only the CDF
    curve carries *label*, so the legend shows one entry per group.

    Returns the (counts, bin_edges, pdf, cdf) arrays that were plotted.
    """
    valid_ages = ages[ages.notna()]
    counts, bin_edges = np.histogram(valid_ages, bins=10, density = True)
    # Rescale so the ten bin values sum to 1.
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    # Both curves are drawn against the right-hand edge of each bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf, label=label)
    return counts, bin_edges, pdf, cdf


_plot_age_distribution(deceased['age'], 'Death')
# Keep the module-level names bound to the last (released) histogram, as the
# original cell did.
counts, bin_edges, pdf, cdf = _plot_age_distribution(released['age'], 'Recovered')
plt.xlabel('Age of patient')
plt.ylabel('Percentage')
plt.legend()
plt.grid()
plt.show()
In [13]:
# Bar chart of the number of rows per `infected_by` value, with a grid.
plt.figure(figsize=(15,5))
infection_counts = patient.infected_by.value_counts()
bar_axes = infection_counts.plot.bar()
bar_axes.grid()
In [14]:
# Linear-regression forecast: feature selection.
# Count the patient_id entries for every confirmed_date, then wrap the
# resulting Series in a DataFrame.
daily_counts = patient.groupby('confirmed_date')['patient_id'].count()
case_count_per_day = pd.DataFrame(daily_counts)
In [15]:
# Running total of the per-day case counts.
data = case_count_per_day.cumsum()
print(data)
# Use the rows from position 16 onward as the data set.
first_row = 16
dataset = data.iloc[first_row:]
print(dataset)
In [16]:
days_in_future = 7
dates = pd.date_range('2020-2-20', '2020-3-11')
n_days = len(dates)
# reshape(-1, 1): fix the column count at 1 and let NumPy work out the row
# count, giving a single-column 2-D array.
# Day offsets for the observed window plus the days to forecast.
future_y_pred = np.array(range(n_days + days_in_future)).reshape(-1, 1)
# Day offsets for the observed window only (despite the name, these are
# inputs, not predictions).
y_pred = np.array(range(n_days)).reshape(-1, 1)
print(n_days)
In [17]:
# Regressor input: day offsets 0 .. len(dates)-1 as a single column.
n_observed = len(dates)
x = np.array(range(n_observed)).reshape(-1, 1)
# Target: the values of `dataset` reshaped into a single column.
y = np.array(dataset).reshape(-1, 1)
In [18]:
from sklearn.linear_model import LinearRegression

# Fit the model on the observed window.
linear_model = LinearRegression().fit(x, y)
# Score of the fitted model on its own training data.
r_sq = linear_model.score(x, y)
# y_pred held the observed day offsets; rebind it to the model's predictions
# for those offsets.
y_pred = linear_model.predict(y_pred)
# Predictions for the observed offsets plus the future ones.
linear_pred = linear_model.predict(future_y_pred)
In [19]:
# Size of graph
plt.figure(figsize=(15, 6))
# Predicted count for every offset (current + future), then the actual counts.
plt.plot(linear_pred, color='red', label='Predicted count')
plt.plot(dataset, label='Actual count')
# Axis labels.
plt.xlabel('Dates')
plt.ylabel('Total number of cases')
# Dotted vertical line at the position of the last predicted value, with a
# caption placed to its right.
n_pred = len(linear_pred)
plt.vlines(x=n_pred - 1, ymin=0, ymax=12000, linestyles='dotted')
plt.text(
    x=n_pred + 2,
    y=5000,
    s='predicted no. of\ncases by next week',
    color='black',
    fontsize=15,
    horizontalalignment='center',
)
plt.xticks(rotation=90)
plt.legend()
plt.show()
반응형