728x90

여행 보험 패키지 상품을 구매할(TravelInsurance 값이 1일) 확률을 구하시오.

  • 예측할 값(y): TravelInsurance (여행보험 패키지 구매 여부, 0: 구매 안 함, 1: 구매)
  • 평가: ROC-AUC 평가지표
  • data: t2-1-train.csv, t2-1-test.csv
  • 제출 형식
id,TravelInsurance
0,0.3
1,0.48
2,0.3
3,0.83

 

# Travel-insurance purchase prediction: fit a random forest on the training
# set and report, for every test row, the predicted probability of class 1.

# ---- Load data ----
df_train <- read.csv(
  "C:/BigDataCertificationCourses-main/3rd/t2-1-train.csv",
  header = TRUE
)
df_test <- read.csv(
  "C:/BigDataCertificationCourses-main/3rd/t2-1-test.csv",
  header = TRUE
)
head(df_train, 10)

# The first column is treated as the row id: keep the test ids for the result
# table, then drop the column from both sets so it is not used as a predictor.
# drop = FALSE guarantees the result stays a data frame.
df_test_id <- df_test[, 1]
df_train <- df_train[, -1, drop = FALSE]
df_test <- df_test[, -1, drop = FALSE]

# ---- Inspect ----
summary(df_train)
summary(df_test)
typeof(df_train)
table(is.na(df_train$TravelInsurance))

# ---- Impute missing values ----
# The mean is estimated on the training set only and then applied to BOTH
# sets, so test rows with a missing AnnualIncome still get a prediction.
mean_AnnualIncome <- mean(df_train$AnnualIncome, na.rm = TRUE)
df_train$AnnualIncome[is.na(df_train$AnnualIncome)] <- mean_AnnualIncome
df_test$AnnualIncome[is.na(df_test$AnnualIncome)] <- mean_AnnualIncome
summary(df_train$AnnualIncome)

# ---- Encode categorical predictors ----
# Since R 4.0 read.csv() returns strings as character vectors. Convert them to
# factors and force the test columns onto the training levels so that the
# predictor types seen by predict() match those seen during fitting.
is_cat <- vapply(
  df_train,
  function(x) is.character(x) || is.factor(x),
  logical(1)
)
cat_cols <- setdiff(names(df_train)[is_cat], "TravelInsurance")
for (col in cat_cols) {
  df_train[[col]] <- factor(df_train[[col]])
  if (col %in% names(df_test)) {
    df_test[[col]] <- factor(
      as.character(df_test[[col]]),
      levels = levels(df_train[[col]])
    )
  }
}

# A factor response makes randomForest() fit a classifier, not a regression.
df_train$TravelInsurance <- as.factor(df_train$TravelInsurance)

# ---- Fit and predict ----
# Install the package only when it is missing instead of on every run.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fix the RNG seed so the forest (and the submitted probabilities) can be
# reproduced.
set.seed(42)
md.rf <- randomForest(TravelInsurance ~ ., data = df_train)

# type = "prob" returns one probability column per class rather than the
# predicted label; the task asks for the probability of class 1.
pred.rf <- predict(md.rf, newdata = df_test, type = "prob")

# ---- Build the submission table ----
# Use the ids saved before the id column was dropped; df_test[, 1] is now the
# first feature, not the id. The column named "1" holds P(TravelInsurance = 1).
result <- data.frame(
  id = df_test_id,
  TravelInsurance = pred.rf[, "1"],
  row.names = NULL
)
result
728x90