빅데이터 | 머신러닝 | 딥러닝/빅데이터 분석

Spark를 이용해서 삼성전자 주식 분석하기

냠냠:) 2020. 4. 15. 05:33

[수행 중 나타났던 Unsupported class file major version 55 해결방법]

자바 설치하기 sudo apt-get install openjdk-8-jdk

새로운 자바 버전으로 바꾸기 sudo update-alternatives --config java 

 

 

In [3]:
import findspark
# findspark locates a local Spark installation and puts pyspark on sys.path
In [4]:
findspark.init('/home/hak/spark-2.4.5-bin-hadoop2.7/')
# Path to the local Spark home directory (adjust to your own installation)
In [5]:
from pyspark.sql import SparkSession
# SparkSession is the entry point to the DataFrame / SQL API
In [6]:
spark = SparkSession.builder.appName('Basics').getOrCreate()
In [7]:
df = spark.read.csv('Samsung_stock.csv', inferSchema=True, header=True)
# ~5 years of Samsung Electronics daily prices downloaded from Yahoo Finance;
# inferSchema=True asks Spark to guess each column's type from the data
In [8]:
df.printSchema()
# printSchema() prints the column names and data types Spark inferred for the CSV
 
root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)

In [44]:
from pyspark.sql.types import StructField,StringType,DoubleType,IntegerType,TimestampType,StructType
In [27]:
# Explicit schema for the CSV. NOTE: when a user-defined schema is supplied,
# Spark binds schema fields to CSV columns BY POSITION — header=True only
# skips the header row, it does not match fields by name. The original listed
# Volume before 'Adj Close', the reverse of the file's actual column order
# (Date,Open,High,Low,Close,Adj Close,Volume), so 'Adj Close' price strings
# were parsed as an IntegerType Volume (yielding nulls) and vice versa.
# Fields are now declared in the file's real column order.
data_schema = StructType([
        StructField('Date',TimestampType(),True),
        StructField('Open',DoubleType(),True),
        StructField('High',DoubleType(),True),
        StructField('Low',DoubleType(),True),
        StructField('Close',DoubleType(),True),
        StructField('Adj Close',DoubleType(),True),
        StructField('Volume',IntegerType(),True)
    ])
In [38]:
df = spark.read.csv('Samsung_stock.csv', header=True, schema=data_schema)
In [45]:
df.printSchema()
 
root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)

In [9]:
df.show()
# show() prints the first 20 rows of the DataFrame in tabular form
 
+-------------------+------------+------------+------------+------------+------------+--------+
|               Date|        Open|        High|         Low|       Close|   Adj Close|  Volume|
+-------------------+------------+------------+------------+------------+------------+--------+
|2019-04-15 00:00:00|47150.000000|47500.000000|47000.000000|47050.000000|47050.000000| 8733019|
|2019-04-16 00:00:00|47400.000000|47400.000000|46800.000000|47250.000000|47250.000000| 7730881|
|2019-04-17 00:00:00|47300.000000|47600.000000|47000.000000|47050.000000|47050.000000| 5513658|
|2019-04-18 00:00:00|47200.000000|47250.000000|45500.000000|45600.000000|45600.000000|12808747|
|2019-04-19 00:00:00|45750.000000|46000.000000|45250.000000|45300.000000|45300.000000| 8479710|
|2019-04-22 00:00:00|45400.000000|45900.000000|45100.000000|45350.000000|45350.000000| 6006957|
|2019-04-23 00:00:00|45050.000000|45500.000000|45000.000000|45200.000000|45200.000000| 6920566|
|2019-04-24 00:00:00|45400.000000|45650.000000|44150.000000|44750.000000|44750.000000|13299267|
|2019-04-25 00:00:00|44250.000000|45000.000000|44100.000000|44650.000000|44650.000000|10868965|
|2019-04-26 00:00:00|44200.000000|45000.000000|43800.000000|44850.000000|44850.000000| 9739982|
|2019-04-29 00:00:00|45150.000000|46150.000000|45100.000000|46150.000000|46150.000000| 8777517|
|2019-04-30 00:00:00|46000.000000|46300.000000|45350.000000|45850.000000|45850.000000|11121273|
|2019-05-02 00:00:00|45500.000000|46150.000000|45400.000000|45900.000000|45900.000000| 8625126|
|2019-05-03 00:00:00|45900.000000|46050.000000|45300.000000|45300.000000|45300.000000| 6562916|
|2019-05-07 00:00:00|45250.000000|45300.000000|44400.000000|44850.000000|44850.000000|12014907|
|2019-05-08 00:00:00|44300.000000|44850.000000|44200.000000|44250.000000|44250.000000|10398754|
|2019-05-09 00:00:00|43900.000000|44250.000000|42450.000000|42450.000000|42450.000000|23029718|
|2019-05-10 00:00:00|42600.000000|43450.000000|42450.000000|42900.000000|42900.000000|14579512|
|2019-05-13 00:00:00|42500.000000|43200.000000|42350.000000|42650.000000|42650.000000| 7635079|
|2019-05-14 00:00:00|41300.000000|43100.000000|41300.000000|42650.000000|42650.000000|11563730|
+-------------------+------------+------------+------------+------------+------------+--------+
only showing top 20 rows

In [10]:
type(df['Open'])
# Indexing a DataFrame with a column name yields a Column object
Out[10]:
pyspark.sql.column.Column
In [11]:
type(df.select('Open'))
# select() returns a new DataFrame containing only the chosen column(s)
Out[11]:
pyspark.sql.dataframe.DataFrame
In [12]:
# Multiple columns can be selected at once; select() accepts column names
# either as a list or as separate positional arguments.
df.select('Date', 'Close').show()
 
+-------------------+------------+
|               Date|       Close|
+-------------------+------------+
|2019-04-15 00:00:00|47050.000000|
|2019-04-16 00:00:00|47250.000000|
|2019-04-17 00:00:00|47050.000000|
|2019-04-18 00:00:00|45600.000000|
|2019-04-19 00:00:00|45300.000000|
|2019-04-22 00:00:00|45350.000000|
|2019-04-23 00:00:00|45200.000000|
|2019-04-24 00:00:00|44750.000000|
|2019-04-25 00:00:00|44650.000000|
|2019-04-26 00:00:00|44850.000000|
|2019-04-29 00:00:00|46150.000000|
|2019-04-30 00:00:00|45850.000000|
|2019-05-02 00:00:00|45900.000000|
|2019-05-03 00:00:00|45300.000000|
|2019-05-07 00:00:00|44850.000000|
|2019-05-08 00:00:00|44250.000000|
|2019-05-09 00:00:00|42450.000000|
|2019-05-10 00:00:00|42900.000000|
|2019-05-13 00:00:00|42650.000000|
|2019-05-14 00:00:00|42650.000000|
+-------------------+------------+
only showing top 20 rows

In [13]:
df.createOrReplaceTempView("samsung")
# Register the DataFrame as a temporary SQL view named "samsung"
In [14]:
sql_results = spark.sql("Select * from samsung")
In [15]:
sql_results.show()
 
+-------------------+------------+------------+------------+------------+------------+--------+
|               Date|        Open|        High|         Low|       Close|   Adj Close|  Volume|
+-------------------+------------+------------+------------+------------+------------+--------+
|2019-04-15 00:00:00|47150.000000|47500.000000|47000.000000|47050.000000|47050.000000| 8733019|
|2019-04-16 00:00:00|47400.000000|47400.000000|46800.000000|47250.000000|47250.000000| 7730881|
|2019-04-17 00:00:00|47300.000000|47600.000000|47000.000000|47050.000000|47050.000000| 5513658|
|2019-04-18 00:00:00|47200.000000|47250.000000|45500.000000|45600.000000|45600.000000|12808747|
|2019-04-19 00:00:00|45750.000000|46000.000000|45250.000000|45300.000000|45300.000000| 8479710|
|2019-04-22 00:00:00|45400.000000|45900.000000|45100.000000|45350.000000|45350.000000| 6006957|
|2019-04-23 00:00:00|45050.000000|45500.000000|45000.000000|45200.000000|45200.000000| 6920566|
|2019-04-24 00:00:00|45400.000000|45650.000000|44150.000000|44750.000000|44750.000000|13299267|
|2019-04-25 00:00:00|44250.000000|45000.000000|44100.000000|44650.000000|44650.000000|10868965|
|2019-04-26 00:00:00|44200.000000|45000.000000|43800.000000|44850.000000|44850.000000| 9739982|
|2019-04-29 00:00:00|45150.000000|46150.000000|45100.000000|46150.000000|46150.000000| 8777517|
|2019-04-30 00:00:00|46000.000000|46300.000000|45350.000000|45850.000000|45850.000000|11121273|
|2019-05-02 00:00:00|45500.000000|46150.000000|45400.000000|45900.000000|45900.000000| 8625126|
|2019-05-03 00:00:00|45900.000000|46050.000000|45300.000000|45300.000000|45300.000000| 6562916|
|2019-05-07 00:00:00|45250.000000|45300.000000|44400.000000|44850.000000|44850.000000|12014907|
|2019-05-08 00:00:00|44300.000000|44850.000000|44200.000000|44250.000000|44250.000000|10398754|
|2019-05-09 00:00:00|43900.000000|44250.000000|42450.000000|42450.000000|42450.000000|23029718|
|2019-05-10 00:00:00|42600.000000|43450.000000|42450.000000|42900.000000|42900.000000|14579512|
|2019-05-13 00:00:00|42500.000000|43200.000000|42350.000000|42650.000000|42650.000000| 7635079|
|2019-05-14 00:00:00|41300.000000|43100.000000|41300.000000|42650.000000|42650.000000|11563730|
+-------------------+------------+------------+------------+------------+------------+--------+
only showing top 20 rows

In [16]:
df.describe()
Out[16]:
DataFrame[summary: string, Open: string, High: string, Low: string, Close: string, Adj Close: string, Volume: string]
In [80]:
# filter() takes a Column condition, so rows can be restricted without SQL;
# df.Close attribute access is equivalent to df['Close'].
df.filter(df.Close > 60000).show()
 
+-------------------+------------+------------+------------+------------+------------+--------+
|               Date|        Open|        High|         Low|       Close|   Adj Close|  Volume|
+-------------------+------------+------------+------------+------------+------------+--------+
|2020-01-16 00:00:00|59100.000000|60700.000000|59000.000000|60700.000000|60700.000000|14381774|
|2020-01-17 00:00:00|61900.000000|62000.000000|61000.000000|61300.000000|61300.000000|16025661|
|2020-01-20 00:00:00|62000.000000|62800.000000|61700.000000|62400.000000|62400.000000|12528855|
|2020-01-21 00:00:00|62000.000000|62400.000000|61200.000000|61400.000000|61400.000000|11142693|
|2020-01-22 00:00:00|60500.000000|62600.000000|60400.000000|62300.000000|62300.000000|15339565|
|2020-01-23 00:00:00|61800.000000|61800.000000|60700.000000|60800.000000|60800.000000|14916555|
|2020-02-06 00:00:00|60100.000000|61100.000000|59700.000000|61100.000000|61100.000000|14727159|
|2020-02-07 00:00:00|61100.000000|61200.000000|59700.000000|60400.000000|60400.000000|16402493|
|2020-02-12 00:00:00|60300.000000|60700.000000|59700.000000|60500.000000|60500.000000|12904207|
|2020-02-13 00:00:00|61200.000000|61600.000000|60500.000000|60700.000000|60700.000000|18449775|
|2020-02-14 00:00:00|60900.000000|61900.000000|60200.000000|61800.000000|61800.000000|13276067|
|2020-02-17 00:00:00|61600.000000|62000.000000|61200.000000|61500.000000|61500.000000| 8740596|
|2020-02-19 00:00:00|59800.000000|60400.000000|59400.000000|60200.000000|60200.000000|12951496|
+-------------------+------------+------------+------------+------------+------------+--------+

In [84]:
# Combine conditions with & / |; each comparison must be parenthesized
# because & binds tighter than the comparison operators.
close = df['Close']
df.filter((close > 55000) & (close < 60000)).show()
 
+-------------------+------------+------------+------------+------------+------------+--------+
|               Date|        Open|        High|         Low|       Close|   Adj Close|  Volume|
+-------------------+------------+------------+------------+------------+------------+--------+
|2019-12-17 00:00:00|55800.000000|56700.000000|55400.000000|56700.000000|56700.000000|19358433|
|2019-12-18 00:00:00|56700.000000|57200.000000|56000.000000|56300.000000|56300.000000|15558208|
|2019-12-19 00:00:00|57000.000000|57300.000000|55500.000000|56000.000000|56000.000000|14180520|
|2019-12-20 00:00:00|56100.000000|56500.000000|55600.000000|56000.000000|56000.000000|12095519|
|2019-12-23 00:00:00|56100.000000|56400.000000|55100.000000|55500.000000|55500.000000| 9839252|
|2019-12-26 00:00:00|54700.000000|55400.000000|54400.000000|55400.000000|55400.000000| 9645034|
|2019-12-27 00:00:00|55700.000000|56900.000000|55500.000000|56500.000000|56500.000000|12313056|
|2019-12-30 00:00:00|56200.000000|56600.000000|55700.000000|55800.000000|55800.000000| 8356767|
|2020-01-02 00:00:00|55500.000000|56000.000000|55000.000000|55200.000000|55200.000000|12993228|
|2020-01-03 00:00:00|56000.000000|56600.000000|54900.000000|55500.000000|55500.000000|15422255|
|2020-01-06 00:00:00|54900.000000|55600.000000|54600.000000|55500.000000|55500.000000|10278951|
|2020-01-07 00:00:00|55700.000000|56400.000000|55600.000000|55800.000000|55800.000000|10009778|
|2020-01-08 00:00:00|56200.000000|57400.000000|55900.000000|56800.000000|56800.000000|23501171|
|2020-01-09 00:00:00|58400.000000|58600.000000|57400.000000|58600.000000|58600.000000|24102579|
|2020-01-10 00:00:00|58800.000000|59700.000000|58300.000000|59500.000000|59500.000000|16000170|
|2020-01-15 00:00:00|59500.000000|59600.000000|58900.000000|59000.000000|59000.000000|14300928|
|2020-01-28 00:00:00|59400.000000|59400.000000|58300.000000|58800.000000|58800.000000|23664541|
|2020-01-29 00:00:00|59100.000000|59700.000000|58800.000000|59100.000000|59100.000000|16446102|
|2020-01-30 00:00:00|58800.000000|58800.000000|56800.000000|57200.000000|57200.000000|20821939|
|2020-01-31 00:00:00|57800.000000|58400.000000|56400.000000|56400.000000|56400.000000|19749457|
+-------------------+------------+------------+------------+------------+------------+--------+
only showing top 20 rows

In [17]:
# collect() materializes the filtered rows as a Python list of Row objects;
# df.Low attribute access is equivalent to df["Low"].
result = df.filter(df.Low > 60000).collect()
In [18]:
type(result[0])
Out[18]:
pyspark.sql.types.Row
In [19]:
result[0].asDict()
# asDict() converts a Row into a plain dict so its values can be inspected
# and iterated (e.g. in a for loop over keys/values)
Out[19]:
{'Adj Close': '61300.000000',
 'Close': '61300.000000',
 'Date': datetime.datetime(2020, 1, 17, 0, 0),
 'High': '62000.000000',
 'Low': '61000.000000',
 'Open': '61900.000000',
 'Volume': '16025661'}

 

 

반응형