728x90

전자상거래 배송 데이터

제품이 배송 시간에 맞춰 배송되었는지 예측하는 모델 만들기

학습용 데이터(X_train, y_train)를 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측 확률값을 다음과 같은 형식의 CSV 파일로 생성하시오. (제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

ID, Reached.on.Time_Y.N
4733,0.6
2040,0.8
5114,0.45
2361,0.23
5996,0.43
# Load the training data (first row of the file holds the column names) and
# print a per-column summary for a quick sanity check.
df_train = read.csv("C:/BigDataCertificationCourses-main/2nd/train.csv", header = TRUE)
summary(df_train)

# Convert the categorical predictors and the outcome to factors.
# Each column is overwritten in place (no new, differently named copies are
# created), so a model formula of the form `y ~ .` sees every predictor
# exactly once and with the intended type.
factor.cols = c("Warehouse_block",
                "Mode_of_Shipment",
                "Product_importance",
                "Gender",
                "Reached.on.Time_Y.N")
df_train[factor.cols] = lapply(df_train[factor.cols], as.factor)

library(caret)

# Fix the RNG seed so the train/hold-out split is reproducible between runs.
set.seed(2022)

# 80/20 split. createDataPartition() stratifies on the vector it is given, so
# pass the outcome (a factor) rather than the row ID: this keeps the class
# proportions the same in both parts.
train.idx = createDataPartition(df_train$Reached.on.Time_Y.N,
                                p = .8)
train = df_train[train.idx$Resample1, ]
test = df_train[-train.idx$Resample1, ]

# Positions of the columns to standardise.
# NOTE(review): these indices assume the column order of train.csv and that
# all five columns are numeric - confirm against the file.
num.cols = c(5, 6, 7, 10, 11)

# Estimate the preprocessing parameters (caret's default: centre and scale) on
# the training rows only, then apply the same transformation to both parts, so
# no statistics from the hold-out rows leak into model fitting.
normal = preProcess(train[, num.cols])
train = predict(normal, train)
test = predict(normal, test)

# ---------------------------------------------------------------------------
# Fit three candidate classifiers on the training split, score each one on the
# hold-out split by ROC-AUC (the metric named in the task), and build the
# output table of predicted probabilities from the best one.
# Column 1 (ID) is dropped everywhere so it is never used as a predictor.
# ---------------------------------------------------------------------------

# Random forest and the SVM probability model both use random numbers; fix the
# seed so the fitted models (and therefore the scores) are reproducible.
set.seed(2022)

# Logistic regression on all predictors, followed by stepwise selection.
md.lm = glm(Reached.on.Time_Y.N ~ .,
            data = train[, -1],
            family = 'binomial')
md.lm = step(md.lm)

library(randomForest)
md.rf = randomForest(Reached.on.Time_Y.N ~ .,
                     data = train[, -1])

library(e1071)
# probability = TRUE makes svm() also fit the probability model, which is
# required to obtain class probabilities (not just labels) at predict time.
md.svm = svm(Reached.on.Time_Y.N ~ .,
             data = train[, -1],
             probability = TRUE)

# For a factor response, a binomial glm models the probability of the second
# factor level; use that level as the "positive" class for every model so the
# probabilities are comparable.
lv = levels(test$Reached.on.Time_Y.N)
positive = lv[2]

# Predicted probability of the positive class on the hold-out split.
prob.lm = predict(md.lm, newdata = test[, -1], type = "response")
prob.rf = predict(md.rf, newdata = test[, -1], type = "prob")[, positive]
pred.svm = predict(md.svm, newdata = test[, -1], probability = TRUE)
prob.svm = attr(pred.svm, "probabilities")[, positive]

# Class labels, used only for the confusion matrices below.
# The logistic probabilities are cut at 0.5 and mapped back onto the outcome's
# own levels so the prediction and reference factors share the same levels.
pred.lm = factor(ifelse(prob.lm >= 0.5, lv[2], lv[1]), levels = lv)
pred.rf = predict(md.rf, newdata = test[, -1])

# confusionMatrix() expects the predictions first and the reference second.
confusionMatrix(data = pred.lm, reference = test$Reached.on.Time_Y.N,
                positive = positive)
confusionMatrix(data = pred.rf, reference = test$Reached.on.Time_Y.N,
                positive = positive)
confusionMatrix(data = pred.svm, reference = test$Reached.on.Time_Y.N,
                positive = positive)

# ROC-AUC of each model, computed from the predicted probabilities.
auc.scores = c(
  lm = ModelMetrics::auc(test$Reached.on.Time_Y.N, prob.lm),
  rf = ModelMetrics::auc(test$Reached.on.Time_Y.N, prob.rf),
  svm = ModelMetrics::auc(test$Reached.on.Time_Y.N, prob.svm)
)
auc.scores

# Pick the model with the highest hold-out AUC instead of hard-coding one.
prob.all = list(lm = prob.lm, rf = prob.rf, svm = prob.svm)
best = names(which.max(auc.scores))

# Output table in the requested layout: ID plus predicted probability.
final = data.frame(test$ID, unname(prob.all[[best]]))
colnames(final) = c("ID", "Reached.on.Time_Y.N")
final
# NOTE(review): the task asks for a CSV of predictions on the evaluation set
# (X_test); this script only scores the hold-out part of train.csv. Once the
# evaluation data is loaded and preprocessed the same way, write the result
# with e.g. write.csv(final, "<output file>", row.names = FALSE).
728x90