R - 데이터 처리를 위한 패키지

dplyr 불러오기

require(dplyr)

Pipe operator

# Sample vector mixing positive and negative values.
x <- c(1, -1, 2, -2, 3, -3)

# Nested-call form: mean of the absolute values.
mean(abs(x))

# The same computation written as a left-to-right pipeline.
x %>%
  abs() %>%
  mean()

filter 함수

# Keep rows where Sepal.Length > 5.5 and Petal.Length < 6, then show the
# first rows. Comma-separated conditions in filter() are combined with AND.
iris %>%
  filter(Sepal.Length > 5.5, Petal.Length < 6) %>%
  head()

select 함수

# Base R: columns cannot be dropped by name with a minus sign, so build a
# logical mask over the column names instead.
iris[!(names(iris) %in% c("Sepal.Length", "Sepal.Width"))]

# dplyr: select() accepts negated column names directly.
iris %>%
  select(-Sepal.Length, -Sepal.Width)

R기본 함수인 subset함수 사용

# Base R subset(): filter rows and drop the two sepal columns in one call.
subset(
  iris,
  Sepal.Length > 5.5 & Petal.Length < 6,
  select = -c(Sepal.Length, Sepal.Width)
)

위의 subset구문을 filter와 select로 구현

# Same result as the subset() call, expressed as two dplyr steps:
# first filter the rows, then drop the two sepal columns.
iris %>%
  filter(Sepal.Length > 5.5, Petal.Length < 6) %>%
  select(-Sepal.Length, -Sepal.Width)

mutate 함수

# Add a per-row length_sum column (sepal length + petal length).
# sum() would collapse both columns into a single grand total and recycle
# that one number onto every row, so the vectorised `+` is used instead.

# Base R transform()
iris %>%
  transform(length_sum = Sepal.Length + Petal.Length) %>%
  head()

# dplyr mutate()
iris %>%
  mutate(length_sum = Sepal.Length + Petal.Length) %>%
  head()

mutate는 새로 만드는 변수를 참조해서 추가적인 새로운 변수를 또 만들 수 있음

summarise 함수

# Collapse the data to a single row: the mean of all Sepal.Length and
# Petal.Length values pooled together into one vector.
iris %>%
  summarise(
    iris_mean = mean(c(Sepal.Length, Petal.Length))
  )

arrange 함수

# dplyr: sort by Sepal.Length ascending, then Sepal.Width descending.
iris %>%
  arrange(Sepal.Length, desc(Sepal.Width)) %>%
  head()

# Base R equivalent. A per-key `decreasing` vector is only supported by the
# radix method, so request it explicitly instead of relying on "auto".
# TRUE/FALSE are spelled out because T and F can be reassigned.
head(iris[order(iris$Sepal.Length, iris$Sepal.Width,
                decreasing = c(FALSE, TRUE), method = "radix"), ])

group_by 함수

# Grouped mutate: every row receives its species' mean of
# (Sepal.Length + Sepal.Width); the row count is unchanged.
iris %>%
  group_by(Species) %>%
  mutate(iris_length = mean(Sepal.Length + Sepal.Width))

# Grouped summarise: one row per species with two column means.
iris %>%
  group_by(Species) %>%
  summarise(
    mean1 = mean(Sepal.Length),
    mean2 = mean(Petal.Length)
  )

reshape2 패키지

require(reshape2)

melt 함수

# Reshape to long format: Species is kept as the identifier and the two
# sepal measurements are stacked into variable/value pairs.
melt(
  iris,
  id.vars = "Species",
  measure.vars = c("Sepal.Length", "Sepal.Width")
)

cast 함수

# Long-format copy of the two sepal measurements, keyed by Species.
iris_melt <- melt(
  iris,
  id.vars = "Species",
  measure.vars = c("Sepal.Length", "Sepal.Width")
)

# Back to wide format: one row per species, one column per measured
# variable, aggregating the values with mean().
dcast(iris_melt, Species ~ variable, fun.aggregate = mean)

# The same means computed with dplyr, left in long format.
iris_melt %>%
  group_by(Species, variable) %>%
  summarise(mean = mean(value))

과제

# Load the built-in airquality data set.
data(airquality)

# Add a windchill column computed from the Temp and Wind columns.
airquality <- airquality %>%
  mutate(
    windchill = 35.74 + 0.6215 * Temp - 35.75 * Wind^(0.16) +
      0.4275 * Temp * Wind^(0.16)
  )

# Monthly means of the four measurements, ignoring missing values.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
airquality %>%
  group_by(Month) %>%
  summarise(
    Ozone = mean(Ozone, na.rm = TRUE),
    Solar.R = mean(Solar.R, na.rm = TRUE),
    Wind = mean(Wind, na.rm = TRUE),
    Temp = mean(Temp, na.rm = TRUE)
  )