
Handling Sales data with Spark (using groupBy and orderBy)

냠냠:) 2020. 4. 23. 17:50

https://support.spatialkey.com/spatialkey-sample-csv-data/

 

Sample CSV Data – SpatialKey Support

[The CSV provided at the link above was used.]

The Jupyter notebook's output cells got quite squashed during the HTML conversion ㅠㅜ

 

GroupByDataFrame
In [1]:
import findspark 
In [2]:
findspark.init('/home/hak/spark-2.4.5-bin-hadoop2.7/')  # point findspark at the local Spark installation
In [3]:
from pyspark.sql import SparkSession 
In [4]:
spark = SparkSession.builder.appName('groupby').getOrCreate() 
In [5]:
df = spark.read.csv('SalesJan2009.csv', inferSchema=True, header=True)  # load the CSV
In [6]:
df.printSchema()  # initial schema
root
 |-- Transaction_date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Payment_Type: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Account_Created: string (nullable = true)
 |-- Last_Login: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
In [11]:
df.show() 
+----------------+--------+-----+------------+-----------------+--------------------+--------------+--------------+---------------+------------+-----------+----------+
|Transaction_date| Product|Price|Payment_Type| Name| City| State| Country|Account_Created| Last_Login| Latitude| Longitude|
+----------------+--------+-----+------------+-----------------+--------------------+--------------+--------------+---------------+------------+-----------+----------+
| 1/2/09 6:17|Product1| 1200| Mastercard| carolina| Basildon| England|United Kingdom| 1/2/09 6:00| 1/2/09 6:08| 51.5|-1.1166667|
| 1/2/09 4:53|Product1| 1200| Visa| Betina|Parkville ...| MO| United States| 1/2/09 4:42| 1/2/09 7:49| 39.195| -94.68194|
| 1/2/09 13:08|Product1| 1200| Mastercard|Federica e Andrea|Astoria ...| OR| United States| 1/1/09 16:21|1/3/09 12:32| 46.18806| -123.83|
| 1/3/09 14:44|Product1| 1200| Visa| Gouya| Echuca| Victoria| Australia| 9/25/05 21:13|1/3/09 14:22|-36.1333333| 144.75|
| 1/4/09 12:56|Product2| 3600| Visa| Gerd W |Cahaba Heights ...| AL| United States| 11/15/08 15:47|1/4/09 12:45| 33.52056| -86.8025|
| 1/4/09 13:19|Product1| 1200| Visa| LAURENCE|Mickleton ...| NJ| United States| 9/24/08 15:19|1/4/09 13:04| 39.79| -75.23806|
| 1/4/09 20:11|Product1| 1200| Mastercard| Fleur|Peoria ...| IL| United States| 1/3/09 9:38|1/4/09 19:45| 40.69361| -89.58889|
| 1/2/09 20:09|Product1| 1200| Mastercard| adam|Martin ...| TN| United States| 1/2/09 17:43|1/4/09 20:01| 36.34333| -88.85028|
| 1/4/09 13:17|Product1| 1200| Mastercard| Renee Elisabeth| Tel Aviv| Tel Aviv| Israel| 1/4/09 13:03|1/4/09 22:10| 32.0666667|34.7666667|
| 1/4/09 14:11|Product1| 1200| Visa| Aidan| Chatou| Ile-de-France| France| 6/3/08 4:22| 1/5/09 1:17| 48.8833333| 2.15|
| 1/5/09 2:42|Product1| 1200| Diners| Stacy|New York ...| NY| United States| 1/5/09 2:23| 1/5/09 4:59| 40.71417| -74.00639|
| 1/5/09 5:39|Product1| 1200| Amex| Heidi| Eindhoven| Noord-Brabant| Netherlands| 1/5/09 4:55| 1/5/09 8:15| 51.45| 5.4666667|
| 1/2/09 9:16|Product1| 1200| Mastercard| Sean |Shavano Park ...| TX| United States| 1/2/09 8:32| 1/5/09 9:05| 29.42389| -98.49333|
| 1/5/09 10:08|Product1| 1200| Visa| Georgia|Eagle ...| ID| United States| 11/11/08 15:53|1/5/09 10:05| 43.69556|-116.35306|
| 1/2/09 14:18|Product1| 1200| Visa| Richard|Riverside ...| NJ| United States| 12/9/08 12:07|1/5/09 11:01| 40.03222| -74.95778|
| 1/4/09 1:05|Product1| 1200| Diners| Leanne| Julianstown| Meath| Ireland| 1/4/09 0:00|1/5/09 13:36| 53.6772222|-6.3191667|
| 1/5/09 11:37|Product1| 1200| Visa| Janet| Ottawa| Ontario| Canada| 1/5/09 9:35|1/5/09 19:24| 45.4166667| -75.7|
| 1/6/09 5:02|Product1| 1200| Diners| barbara| Hyderabad|Andhra Pradesh| India| 1/6/09 2:41| 1/6/09 7:52| 17.3833333|78.4666667|
| 1/6/09 7:45|Product2| 3600| Visa| Sabine| London| England|United Kingdom| 1/6/09 7:00| 1/6/09 9:17| 51.52721| 0.14559|
| 1/2/09 7:35|Product1| 1200| Diners| Hani|Salt Lake City ...| UT| United States| 12/30/08 5:44|1/6/09 10:52| 40.76083|-111.89028|
+----------------+--------+-----+------------+-----------------+--------------------+--------------+--------------+---------------+------------+-----------+----------+
only showing top 20 rows
In [12]:
df = df.select("Transaction_date", "Product", "Price")  # keep only the columns we need and redefine df
In [15]:
df.show() 
+----------------+--------+-----+
|Transaction_date| Product|Price|
+----------------+--------+-----+
|     1/2/09 6:17|Product1| 1200|
|     1/2/09 4:53|Product1| 1200|
|    1/2/09 13:08|Product1| 1200|
|    1/3/09 14:44|Product1| 1200|
|    1/4/09 12:56|Product2| 3600|
|    1/4/09 13:19|Product1| 1200|
|    1/4/09 20:11|Product1| 1200|
|    1/2/09 20:09|Product1| 1200|
|    1/4/09 13:17|Product1| 1200|
|    1/4/09 14:11|Product1| 1200|
|     1/5/09 2:42|Product1| 1200|
|     1/5/09 5:39|Product1| 1200|
|     1/2/09 9:16|Product1| 1200|
|    1/5/09 10:08|Product1| 1200|
|    1/2/09 14:18|Product1| 1200|
|     1/4/09 1:05|Product1| 1200|
|    1/5/09 11:37|Product1| 1200|
|     1/6/09 5:02|Product1| 1200|
|     1/6/09 7:45|Product2| 3600|
|     1/2/09 7:35|Product1| 1200|
+----------------+--------+-----+
only showing top 20 rows
In [25]:
df.printSchema() 
root
 |-- Transaction_date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price: string (nullable = true)
In [27]:
from pyspark.sql.types import DoubleType 
In [28]:
df = df.withColumn('Price', df['Price'].cast(DoubleType()))  # Price is a string, which is awkward for arithmetic, so cast it to Double
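One thing to keep in mind (not covered in the original notebook): the cast silently turns any Price value it cannot parse as a number into null, so it can be worth checking how many rows ended up null after the cast. A minimal sketch:

from pyspark.sql.functions import col

# Rows whose Price is null after the cast: either empty in the source file
# or not parseable as a number.
df.filter(col('Price').isNull()).count()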
In [30]:
df.groupBy('Product').min().show()  # minimum price per product
+---------+----------+
|  Product|min(Price)|
+---------+----------+
| Product3|    7500.0|
| Product2|    3600.0|
| Product1|     250.0|
|Product3 |    7500.0|
+---------+----------+
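Note that the output lists both 'Product3' and 'Product3 ' (with a trailing space) as separate groups, because the raw column contains inconsistent whitespace. A minimal sketch of normalizing the column with trim() before grouping, so the duplicate group disappears (df_clean is a name introduced here just for illustration):

from pyspark.sql.functions import trim

# Strip leading/trailing whitespace from Product so that 'Product3 ' and
# 'Product3' fall into the same group.
df_clean = df.withColumn('Product', trim(df['Product']))
df_clean.groupBy('Product').min().show()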
In [31]:
df.agg({'Price': 'max'}).show()  # maximum Price over the whole DataFrame
+----------+
|max(Price)|
+----------+
|    7500.0|
+----------+
In [32]:
grouped = df.groupBy('Product')  # group the DataFrame by the Product column
In [34]:
grouped.agg({'Price': 'max'}).show()  # maximum price for each group
+---------+----------+
|  Product|max(Price)|
+---------+----------+
| Product3|    7500.0|
| Product2|    3600.0|
| Product1|    2100.0|
|Product3 |    7500.0|
+---------+----------+
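Several aggregates can also be computed in a single pass by passing multiple column expressions to agg() instead of the dictionary form; a sketch (the aliases are arbitrary names chosen here):

import pyspark.sql.functions as F

# minimum, maximum, and average price per product in one aggregation
grouped.agg(
    F.min('Price').alias('min_price'),
    F.max('Price').alias('max_price'),
    F.avg('Price').alias('avg_price')
).show()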
In [35]:
from pyspark.sql.functions import countDistinct, avg, stddev  # import functions for further aggregations
In [36]:
df.select(countDistinct('Product')).show()  # countDistinct removes duplicates, i.e., counts the distinct Product values (the trailing-space 'Product3 ' makes this 4 rather than 3)
+-----------------------+
|count(DISTINCT Product)|
+-----------------------+
|                      4|
+-----------------------+
In [37]:
df.select(countDistinct('Product').alias('Distinct Product')).show()  # rename the output column with alias
+----------------+
|Distinct Product|
+----------------+
|               4|
+----------------+
In [38]:
df.select(avg('Price')).show()  # average price
+------------------+
|        avg(Price)|
+------------------+
|1622.3671013039118|
+------------------+
In [39]:
df.select(stddev('Price')).show()  # standard deviation
+------------------+
|stddev_samp(Price)|
+------------------+
|1099.0531766851177|
+------------------+
In [40]:
from pyspark.sql.functions import format_number 
In [41]:
Price_std = df.select(stddev('Price').alias('std')) 
In [42]:
Price_std.show() 
+------------------+
|               std|
+------------------+
|1099.0531766851177|
+------------------+
In [43]:
Price_std.select(format_number('std', 2).alias('std')).show()  # format to two decimal places
+--------+
|     std|
+--------+
|1,099.05|
+--------+
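The same pattern works for several statistics at once, without the intermediate Price_std DataFrame; a sketch combining avg and stddev with format_number in a single select (the aliases are arbitrary names chosen here):

from pyspark.sql.functions import avg, stddev, format_number

# mean and standard deviation of Price, both rounded to two decimal places
df.select(
    format_number(avg('Price'), 2).alias('mean_price'),
    format_number(stddev('Price'), 2).alias('std_price')
).show()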
In [44]:
df.orderBy('Price').show()  # sort by Price in ascending order
+----------------+--------+------+
|Transaction_date| Product| Price|
+----------------+--------+------+
|   1/28/09 18:00|Product1|  null|
|    1/30/09 7:42|Product1| 250.0|
|   1/21/09 11:28|Product1| 800.0|
|    1/1/09 11:05|Product1|1200.0|
|    1/9/09 15:58|Product1|1200.0|
|     1/5/09 4:10|Product1|1200.0|
|     1/6/09 7:18|Product1|1200.0|
|     1/2/09 1:11|Product1|1200.0|
|     1/1/09 2:24|Product1|1200.0|
|     1/2/09 4:53|Product1|1200.0|
|     1/2/09 2:57|Product1|1200.0|
|    1/3/09 14:44|Product1|1200.0|
|     1/8/09 0:42|Product1|1200.0|
|    1/4/09 13:19|Product1|1200.0|
|    1/2/09 20:09|Product1|1200.0|
|     1/7/09 8:08|Product1|1200.0|
|    1/4/09 14:11|Product1|1200.0|
|     1/5/09 5:39|Product1|1200.0|
|    1/1/09 20:21|Product1|1200.0|
|    1/5/09 10:08|Product1|1200.0|
+----------------+--------+------+
only showing top 20 rows
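Notice that the row with a null Price sorts first in the ascending result above. If nulls should go to the end instead, the Column API offers asc_nulls_last() (available since Spark 2.4); a sketch:

# Sort ascending by Price, but push rows with a null Price to the end
df.orderBy(df['Price'].asc_nulls_last()).show()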
In [47]:
df.orderBy(df['Price'].desc()).show()  # desc() sorts in descending order
+----------------+---------+------+
|Transaction_date|  Product| Price|
+----------------+---------+------+
|    1/16/09 2:41| Product3|7500.0|
|    1/11/09 4:29| Product3|7500.0|
|   1/25/09 16:11| Product3|7500.0|
|    1/20/09 5:24|Product3 |7500.0|
|   1/25/09 11:35| Product3|7500.0|
|    1/17/09 7:56| Product3|7500.0|
|    1/17/09 7:58| Product3|7500.0|
|   1/22/09 12:45| Product3|7500.0|
|     1/1/09 4:24| Product3|7500.0|
|    1/25/09 1:53| Product3|7500.0|
|     1/5/09 2:57| Product3|7500.0|
|   1/15/09 10:16| Product3|7500.0|
|   1/18/09 13:26| Product3|7500.0|
|    1/12/09 5:50| Product3|7500.0|
|   1/14/09 10:32| Product3|7500.0|
|    1/6/09 22:19| Product2|3600.0|
|     1/5/09 8:58| Product2|3600.0|
|   1/18/09 13:49| Product2|3600.0|
|     1/6/09 7:45| Product2|3600.0|
|    1/4/09 12:56| Product2|3600.0|
+----------------+---------+------+
only showing top 20 rows
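orderBy also accepts multiple sort expressions; a sketch sorting by Product first and breaking ties by descending Price:

from pyspark.sql.functions import asc, desc

# Sort by Product ascending, then by Price descending within each product
df.orderBy(asc('Product'), desc('Price')).show()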
In [ ]:
Thoughts: the functionality here is not much different from SQL in a relational database, but since this is my first time working with pyspark.sql, some parts still feel a little unfamiliar. If I get comfortable with the operations and data-handling patterns each tool provides, I should be able to work with it much more easily, and it should be a big advantage when it comes to extracting data.