A kódoláshoz ChatGPT-t is használtunk

adatelőkészítés (benne numerikus változók kiválasztása, standardizálás, mintavétel-az eredetiben messze túl sok megfigyelés volt, még így is nagyon lassú volt)

set.seed(2024)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(GenSA)
## Warning: package 'GenSA' was built under R version 4.3.3
loan_data <- read.csv("C:/Users/BF/Downloads/loan_data.csv")
szurt<-loan_data[, c(1,4,5,7,9:12,14)]
random_indices <- sample(1:nrow(szurt), size = 3000, replace = FALSE)
sampled_df <- szurt[random_indices, ]
szurt_st<-data.frame(scale(sampled_df[,1:8]),sampled_df$loan_status)
summary(szurt_st$sampled_df.loan_status)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2273  0.0000  1.0000
table(szurt_st$sampled_df.loan_status)
## 
##    0    1 
## 2318  682
cor(as.numeric(szurt_st$predicted_1),as.numeric(szurt_st$predicted_legjobb))
## [1] NA
szurt_st <- data.frame(szurt_st, y=ifelse(szurt_st$sampled_df.loan_status=="1", 1, -1))

averages <- sampled_df %>%
  group_by(loan_status) %>%
  summarise(across(everything(), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), mean, na.rm = TRUE)`.
## ℹ In group 1: `loan_status = 0`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
  1. modell: két magyarázóváltozóval hiperparaméterhangolás nélkül (a két változó kiválasztása előtt sok párosítást próbáltunk ki, hogy melyik két változóval a legszemléletesebb)
set.seed(2024)
svm1 <- svm(as.factor(y)~loan_int_rate+loan_percent_income, data=szurt_st, kernel="radial")
plot(svm1, loan_int_rate~loan_percent_income, data=szurt_st, grid=1000, svSymbol="o")

szurt_st$predicted_1<-predict(svm1,szurt_st)
table(szurt_st$predicted_1,szurt_st$sampled_df.loan_status)
##     
##         0    1
##   -1 2182  349
##   1   136  333
  1. modell még mindig hiperparaméter hangolás nélkül
set.seed(2024)
svm2 <- svm(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, kernel="radial")
plot(svm2, loan_int_rate~loan_percent_income, data=szurt_st, grid=1000, svSymbol="o")

szurt_st$predicted_2<-predict(svm2,szurt_st)
table(szurt_st$predicted_2,szurt_st$sampled_df.loan_status)
##     
##         0    1
##   -1 2228  376
##   1    90  306

hiperparaméter hangolás 1

set.seed(2024)
keresztval <- trainControl(method="cv", number=10)
racs1 <- expand.grid(C=2^(0:4), sigma=2^(-4:4))
hang1 <- train(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, tuneGrid=racs1, method="svmRadial")
eredmeny1 <- hang1$results
ggplot(eredmeny1, aes(x=as.factor(C), y=as.factor(sigma), fill=Accuracy))+geom_tile()

hiperparaméter hangolás 2

set.seed(2024)
racs2 <- expand.grid(C=2^(-3:1), sigma=2^(-6:-2))
hang2 <- train(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, tuneGrid=racs2, method="svmRadial")
eredmeny2 <- hang2$results
ggplot(eredmeny2, aes(x=as.factor(C), y=as.factor(sigma), fill=Accuracy))+geom_tile()

hiperparaméter hangolás 3

set.seed(2024)
racs3 <- expand.grid(C=2^seq(from=-1, to=1, by=0.5), sigma=2^seq(from=-4, to=-1, by=0.5))
hang3 <- train(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, tuneGrid=racs3, method="svmRadial")
eredmeny3 <- hang3$results
ggplot(eredmeny3, aes(x=as.factor(C), y=as.factor(sigma), fill=Accuracy))+geom_tile()

legjobb modell manuális hiperparaméter hangolás

set.seed(2024)
svm_legjobb<-svm(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, kernel="radial", cost=1.4142136, sigma=0.125)
plot(svm_legjobb, loan_int_rate~loan_percent_income, data=szurt_st, grid=1000, svSymbol="o")

szurt_st$predicted_legjobb<-predict(svm_legjobb,szurt_st)
table(szurt_st$predicted_legjobb,szurt_st$sampled_df.loan_status)
##     
##         0    1
##   -1 2228  363
##   1    90  319

automatikus hiperparaméterhangolás

set.seed(2024)
svm_cv <- function(hiperparam){
  modell <- svm(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, kernel="radial", cost=hiperparam[1], sigma=hiperparam[2], cross=10)
  return(-mean(modell$tot.accuracy))}
szim_hutes <- GenSA(fn=svm_cv, lower=c(0.01, 0.01), upper=c(10, 10), control=list(maxit=100)) 
szim_hutes$par
## [1] 4.7641574 0.7125827
-szim_hutes$value
## [1] 83.7

legjobb modell automatikus hiperparaméter hangolással

set.seed(2024)
svm_legjobb_2<-svm(as.factor(y)~person_age+person_income+person_emp_exp+loan_amnt+loan_int_rate+loan_percent_income+cb_person_cred_hist_length+credit_score, data=szurt_st, kernel="radial", cost=4.7641574, sigma=0.7125827)
plot(svm_legjobb_2, loan_int_rate~loan_percent_income, data=szurt_st, grid=1000, svSymbol="o")

szurt_st$predicted_legjobb_2<-predict(svm_legjobb_2,szurt_st)
table(szurt_st$predicted_legjobb_2,szurt_st$sampled_df.loan_status)
##     
##         0    1
##   -1 2237  342
##   1    81  340