#importing the required packages and csv file
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import requests
from io import StringIO
warnings.filterwarnings('ignore')

# Build a direct-download URL from the Google Drive share link and load the CSV.
orig_url = 'https://drive.google.com/file/d/1zvvT5vp9xiKJDXacnwvP0zQ1y1G1C6tz/view?usp=sharing'
file_id = orig_url.split('/')[-2]  # the path segment just before '/view...' is the file id
dwn_url = 'https://drive.google.com/uc?export=download&id=' + file_id

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV data.
response = requests.get(dwn_url, timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, this holds the downloaded CSV text
csv_raw = StringIO(url)
patient = pd.read_csv(csv_raw)
patient.head(5)

# Column names, non-null counts and dtypes of the loaded frame.
patient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7869 entries, 0 to 7868
Data columns (total 15 columns):
patient_id          7869 non-null int64
sex                 679 non-null object
birth_year          666 non-null float64
country             7869 non-null object
region              437 non-null object
disease             28 non-null float64
group               86 non-null object
infection_reason    154 non-null object
infection_order     36 non-null float64
infected_by         70 non-null float64
contact_number      53 non-null float64
confirmed_date      7869 non-null object
released_date       56 non-null object
deceased_date       36 non-null object
state               7869 non-null object
dtypes: float64(5), int64(1), object(9)
memory usage: 922.3+ KB

patient['birth_year'].isnull().sum()  # number of rows whose birth_year is missing

7203

# Sanity check: missing + present birth_year values add up to the total row count.
patient['birth_year'].isna().sum() + patient['birth_year'].notna().sum()

7869

# View birth_year as pandas' nullable integer dtype (result is displayed, not stored).
patient['birth_year'].astype(pd.Int64Dtype())

0       1984
1       1964
2       1966
3       1964
4       1987
        ... 
7864     NaN
7865     NaN
7866     NaN
7867     NaN
7868     NaN
Name: birth_year, Length: 7869, dtype: Int64

# Data pre-processing
# Age relative to the year 2020; stays NaN wherever birth_year is missing.
patient['age'] = 2020 - patient['birth_year']

# One frame per patient state. Explicit copies are taken because these frames
# get new columns assigned further down; without .copy() those assignments are
# made on a selection of `patient` and pandas emits SettingWithCopyWarning
# (hidden here by the global warnings filter).
deceased = patient.loc[patient['state'] == 'deceased'].copy()
released = patient.loc[patient['state'] == 'released'].copy()
isolated = patient.loc[patient['state'] == 'isolated'].copy()

# Preview confirmed_date parsed as datetime (result is displayed, not stored).
pd.to_datetime(deceased['confirmed_date'])

37     2020-02-18
53     2020-02-19
54     2020-02-19
103    2020-02-20
106    2020-02-21
113    2020-02-21
204    2020-02-22
285    2020-02-22
297    2020-02-22
390    2020-02-22
442    2020-02-23
619    2020-02-24
874    2020-02-25
900    2020-02-25
924    2020-02-25
1063   2020-02-26
1287   2020-02-27
1442   2020-02-27
1920   2020-02-28
2084   2020-02-28
2288   2020-02-28
2578   2020-02-29
2613   2020-02-29
2768   2020-02-29
2935   2020-02-29
3549   2020-03-01
3577   2020-03-01
3895   2020-03-01
3913   2020-03-01
4045   2020-03-01
5142   2020-03-03
5459   2020-03-04
5766   2020-03-05
5767   2020-03-05
6284   2020-03-06
7195   2020-03-08
Name: confirmed_date, dtype: datetime64[ns]

# Derived feature for deceased patients
date_column = ['confirmed_date', 'deceased_date']
# Cast both date columns to datetime so they can be subtracted from each other.
deceased[date_column] = deceased[date_column].apply(pd.to_datetime)
# Time elapsed between the confirmed date and the date of death.
deceased['no_of_days_survived'] = deceased['deceased_date'].sub(deceased['confirmed_date'])
deceased.head(5)

# Derived feature for released patients
date_column = ['confirmed_date', 'released_date']
# Cast both date columns to datetime so they can be subtracted from each other.
released[date_column] = released[date_column].apply(pd.to_datetime)
# Time elapsed between the confirmed date and the release date.
released['no_of_days_treated'] = released['released_date'].sub(released['confirmed_date'])
released.head(5)

# Share of all patients in each state, expressed as a percentage of the row count.
total_patients = len(patient)
released_pct = 100 * len(released) / total_patients
deceased_pct = 100 * len(deceased) / total_patients
isolated_pct = 100 * len(isolated) / total_patients
print('The percentage of released patient is :', released_pct)
print('The percentage of deceased patient is :', deceased_pct)
print('The percentage of isolated patient is :', isolated_pct)

The percentage of released patient is : 0.7116533231668573
The percentage of deceased patient is : 0.4574914220358368
The percentage of isolated patient is : 98.8308552547973

# Plot the binned age distribution (PDF) and its running total (CDF) for
# deceased and released patients on the same axes.
for ages, label in ((deceased['age'], 'Death'), (released['age'], 'Recovered')):
    # `age` is NaN wherever birth_year is missing; np.histogram cannot derive a
    # finite bin range from NaN and raises ValueError, so drop those rows first.
    counts, bin_edges = np.histogram(ages.dropna(), bins=10, density=True)
    pdf = counts / counts.sum()   # normalise so the ten bins sum to 1
    cdf = np.cumsum(pdf)
    # Each bin is drawn at its right edge; only the CDF curve gets a legend entry.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf, label=label)

plt.xlabel('Age of patient')
plt.ylabel('Percentage')
plt.legend()
plt.grid()
plt.show()

# Bar chart of how often each `infected_by` value occurs, with a grid.
plt.figure(figsize=(15, 5))
infected_by_counts = patient['infected_by'].value_counts()
infected_by_counts.plot.bar().grid()

# Linear-regression forecast

# Feature selection: number of patients per confirmed date, kept as a
# one-column DataFrame.
case_count_per_day = patient.groupby('confirmed_date')['patient_id'].count().to_frame()

# Running total of cases over the dates.
data = case_count_per_day.cumsum()
print(data)
dataset = data.iloc[16:]          # use the rows from position 16 onward as the dataset
print(dataset)

                patient_id
confirmed_date            
2020-01-20               1
2020-01-24               2
2020-01-26               3
2020-01-27               4
2020-01-30               7
2020-01-31              11
2020-02-01              12
2020-02-02              15
2020-02-04              16
2020-02-05              21
2020-02-06              24
2020-02-09              27
2020-02-10              28
2020-02-16              30
2020-02-18              39
2020-02-19              66
2020-02-20             104
2020-02-21             204
2020-02-22             433
2020-02-23             602
2020-02-24             833
2020-02-25             976
2020-02-26            1263
2020-02-27            1767
2020-02-28            2338
2020-02-29            3150
2020-03-01            4212
2020-03-02            4812
2020-03-03            5328
2020-03-04            5766
2020-03-05            6284
2020-03-06            6767
2020-03-07            7134
2020-03-08            7382
2020-03-09            7513
2020-03-10            7755
2020-03-11            7869
                patient_id
confirmed_date            
2020-02-20             104
2020-02-21             204
2020-02-22             433
2020-02-23             602
2020-02-24             833
2020-02-25             976
2020-02-26            1263
2020-02-27            1767
2020-02-28            2338
2020-02-29            3150
2020-03-01            4212
2020-03-02            4812
2020-03-03            5328
2020-03-04            5766
2020-03-05            6284
2020-03-06            6767
2020-03-07            7134
2020-03-08            7382
2020-03-09            7513
2020-03-10            7755
2020-03-11            7869

# Number of days to extrapolate beyond the observed date range.
days_in_future = 7
dates = pd.date_range(start='2020-2-20', end='2020-3-11')
n_days = len(dates)
# reshape(-1, 1): fixing the column count at 1 lets NumPy infer the row count
# for the -1 slot.
future_y_pred = np.arange(n_days + days_in_future).reshape(-1, 1)

# Day indices for the observed range only (overwritten with predictions later).
y_pred = np.arange(n_days).reshape(-1, 1)
print(n_days)

21

# Feature: day index 0..len(dates)-1 as a single column.
# Target: the values of `dataset` flattened into a single column.
x = np.arange(len(dates)).reshape(-1, 1)
y = dataset.to_numpy().reshape(-1, 1)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(x, y)
linear_pred = linear_model.predict(future_y_pred)  # predictions for every index in future_y_pred
y_pred = linear_model.predict(y_pred)              # replaces the index array with its predictions
r_sq = linear_model.score(x, y)                    # R^2 of the fit on the training data

# Figure size
plt.figure(figsize=(15, 6))

# Linear-model prediction for each day (current + future dates)
plt.plot(linear_pred, label='Predicted count', color='red')

# Actual number of cases for each date
plt.plot(dataset, label='Actual count')

# Axis labels
plt.xlabel('Dates')
plt.ylabel('Total number of cases')

# Dotted vertical line at the position of the last predicted value, plus a
# caption placed a few positions to its right.
n_pred = len(linear_pred)
plt.vlines(x=n_pred - 1, ymin=0, ymax=12000, linestyles='dotted')
plt.text(
    x=n_pred + 2,
    y=5000,
    s='predicted no. of\ncases by next week',
    color='black',
    fontsize=15,
    horizontalalignment='center',
)
plt.xticks(rotation=90)

plt.legend()
plt.show()

[pandas/Selenium/BeautifulSoup4] 야구 시즌 기록 데이터(STATIZ) 웹 크롤링 후 DataFrame 만들기. DataFrame을 csv로 만들고 csv파일 로컬에 저장하기! (feat. Colab) (6)	2020.09.08
[pandas] 공공 데이터 XML 크롤링을 통해 dataFrame으로 만들어보기 (0)	2020.09.04
spark를 이용해서 Missing Data다루기 (0)	2020.04.29
spark를 이용해서 Sales 정보 다루기(using groupBy, orderBy) (0)	2020.04.23
spark를 이용해서 삼성전자 주식 분석하기 (0)	2020.04.15

EI_HJ

[pandas] 코로나 수치 예측하기 (feat. Linear Regression)

'빅데이터 | 머신러닝 | 딥러닝 > 빅데이터 분석' 카테고리의 다른 글

'빅데이터 | 머신러닝 | 딥러닝/빅데이터 분석'의 다른글

티스토리툴바

	patient_id	sex	birth_year	country	region	disease	group	infection_reason	infection_order	infected_by	contact_number	confirmed_date	released_date	deceased_date	state	age	no_of_days_treated
0	1	female	1984.0	China	filtered at airport	NaN	NaN	visit to Wuhan	1.0	NaN	45.0	2020-01-20	2020-02-06	NaN	released	36.0	17 days
1	2	male	1964.0	Korea	filtered at airport	NaN	NaN	visit to Wuhan	1.0	NaN	75.0	2020-01-24	2020-02-05	NaN	released	56.0	12 days
2	3	male	1966.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	16.0	2020-01-26	2020-02-12	NaN	released	54.0	17 days
3	4	male	1964.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	95.0	2020-01-27	2020-02-09	NaN	released	56.0	13 days
4	5	male	1987.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	31.0	2020-01-30	2020-03-02	NaN	released	33.0	32 days

« 2024/05 »
일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31

[pandas] 코로나 수치 예측하기 (feat. Linear Regression)

'빅데이터 | 머신러닝 | 딥러닝 > 빅데이터 분석' 카테고리의 다른 글

'빅데이터 | 머신러닝 | 딥러닝/빅데이터 분석'의 다른글

관련글

티스토리툴바