# Handling NAs and outliers in airquality
# 1. Inspect NAs and outliers
#    NAs      - Ozone, Solar.R
#    Outliers - Ozone, Wind
summary(airquality)
sum(is.na(airquality))
boxplot(airquality)

# 2. Impute NAs with the column median
# median() returns NA when its input contains NAs, so na.rm = TRUE is
# required to get the actual medians.
(Ozone.med <- median(airquality$Ozone, na.rm = TRUE)) # 31.5
fivenum(airquality$Ozone) # fivenum() drops NAs by default
(Solar.R.med <- median(airquality$Solar.R, na.rm = TRUE)) # 205.0

# Work on a copy so the original data set stays untouched.
airquality.clean <- airquality
# Fill with the computed medians instead of hard-coded constants.
airquality.clean$Ozone[is.na(airquality.clean$Ozone)] <- Ozone.med
airquality.clean$Solar.R[is.na(airquality.clean$Solar.R)] <- Solar.R.med
summary(airquality.clean)
sum(is.na(airquality.clean)) # no NAs should remain
# 3. Outlier handling: cap values above the upper fence at the fence value
# After median imputation the Ozone box plot shows many more outliers.
boxplot(airquality.clean)

# Ozone: lower/upper hinges from fivenum(), then the 1.5 * IQR fences.
Ozone.q1 <- fivenum(airquality.clean$Ozone)[2]
Ozone.q1
Ozone.q3 <- fivenum(airquality.clean$Ozone)[4]
Ozone.q3
Ozone.iqr <- Ozone.q3 - Ozone.q1
Ozone.min <- Ozone.q1 - Ozone.iqr * 1.5 # lower fence (computed, not applied)
Ozone.max <- Ozone.q3 + Ozone.iqr * 1.5 # upper fence
# pmin() leaves values at or below the fence alone and caps the rest.
airquality.clean$Ozone <- pmin(airquality.clean$Ozone, Ozone.max)

# Wind: same procedure.
Wind.q1 <- fivenum(airquality.clean$Wind)[2]
Wind.q1
Wind.q3 <- fivenum(airquality.clean$Wind)[4]
Wind.q3
Wind.iqr <- Wind.q3 - Wind.q1
Wind.min <- Wind.q1 - Wind.iqr * 1.5 # lower fence (computed, not applied)
Wind.max <- Wind.q3 + Wind.iqr * 1.5 # upper fence
airquality.clean$Wind <- pmin(airquality.clean$Wind, Wind.max)

boxplot(airquality.clean)
# 4. Check the result: Ozone histograms before and after preprocessing
# par() returns the previous settings; restore them afterwards so the
# 1 x 2 layout does not leak into later plots.
old.par <- par(mfrow = c(1, 2))
hist(airquality$Ozone, xlim = c(0, 200), ylim = c(0, 50), breaks = 12,
     main = "airquality")
hist(airquality.clean$Ozone, xlim = c(0, 200), ylim = c(0, 50), breaks = 6,
     main = "preprocessed")
par(old.par)
가운데는 왜 튀어나온 것일까요?
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
NA를 중위수로 대체했기 때문입니다. Ozone의 결측치가 모두 같은 값(31.5)으로 채워져 그 값이 속한 구간의 빈도만 크게 늘어난 것입니다.
'ADP (R)' 카테고리의 다른 글
[R example] cluster::silhouette (0) | 2022.10.09 |
---|---|
[TIP] RStudio 칼럼 편집 모드 (1) | 2022.10.08 |
[R example] curve (0) | 2022.10.07 |
IQR 기준 이상치 제거 방법 (0) | 2022.10.07 |
ADP 실기 26회 복기 (0) | 2022.10.04 |