Wednesday, 22 May 2019

KNN


# Load the prostate cancer data. stringsAsFactors = FALSE keeps character
# columns as plain character vectors instead of converting them to factors.
prc <- read.csv("Prostate_Cancer.csv", stringsAsFactors = FALSE)

# Display the structure of the data frame (column names, types, first values).
str(prc)

# Drop the first column (id) from the data set.
prc <- prc[-1]

# Count the observations in each diagnosis category.
table(prc$diagnosis_result)

# Append a labelled factor version of the diagnosis:
# "B" becomes "Benign" and "M" becomes "Malignant".
prc$diagnosis <- factor(
  prc$diagnosis_result,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Share of each diagnosis as a percentage, rounded to 1 decimal place
# (hence digits = 1).
round(prop.table(table(prc$diagnosis)) * 100, digits = 1)

#' Min-max rescale a vector so its smallest value maps to 0 and its largest
#' to 1.
#'
#' @param x A numeric vector.
#' @param na.rm Ignore `NA` values when computing the range? Defaults to
#'   `FALSE`, in which case any `NA` in `x` makes the whole result `NA`.
#' @return A vector the same length as `x`. When every value of `x` is the
#'   same (zero range) the result is all zeros rather than `NaN` from 0 / 0.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # isTRUE() keeps the guard safe when span is NA or NaN.
  if (isTRUE(span == 0)) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Apply normalize() to the eight feature columns (2 to 9) and reassemble the
# results into a data frame.
prc_n <- as.data.frame(lapply(prc[2:9], normalize))
summary(prc_n$radius)

# The first 65 rows form the training set; every remaining row is test data.
n_train <- 65
stopifnot(nrow(prc_n) > n_train)
train_rows <- seq_len(n_train)
test_rows <- (n_train + 1):nrow(prc_n)

prc_train <- prc_n[train_rows, ]
prc_test <- prc_n[test_rows, ]

# Class labels for the same rows, taken from the diagnosis factor
# (Benign / Malignant) rather than the raw character codes in column 1.
prc_train_labels <- prc$diagnosis[train_rows]
prc_test_labels <- prc$diagnosis[test_rows]

library(class)

# knn() breaks voting ties at random, and with an even k and two classes ties
# can occur, so fix the seed to make the predictions reproducible.
set.seed(2019)
prc_test_pred <- knn(
  train = prc_train,
  test = prc_test,
  cl = prc_train_labels,
  k = 10
)

# install.packages("gmodels")
library(gmodels)

# Cross-tabulate the true test labels (rows) against the predictions (columns).
CrossTable(x = prc_test_labels, y = prc_test_pred, prop.chisq = FALSE)

# Run k-NN for K = 1 to 10, print the confusion table for each K, and record
# the test-set accuracy so the best K can be reported.
# NOTE(review): K is chosen on the same test set used for evaluation, so the
# reported accuracy is optimistic; consider a separate validation split.
KVALUE <- seq(from = 1, to = 10, by = 1)
accuracy_by_k <- numeric(length(KVALUE))

# knn() breaks voting ties at random; fix the seed so the sweep is repeatable.
set.seed(2019)
for (K in seq_along(KVALUE)) {
  prc_test_pred <- knn(
    train = prc_train,
    test = prc_test,
    cl = prc_train_labels,
    k = KVALUE[K]
  )
  # Prediction check: label the table with the K that produced it.
  cat("\nK =", KVALUE[K], "\n")
  CrossTable(x = prc_test_labels, y = prc_test_pred, prop.chisq = FALSE)
  # Compare as character so factor and character labels both work.
  accuracy_by_k[K] <- mean(
    as.character(prc_test_pred) == as.character(prc_test_labels)
  )
}

# which.max() returns the first maximum, i.e. the smallest K among ties.
best_k <- KVALUE[which.max(accuracy_by_k)]
print(data.frame(k = KVALUE, accuracy = round(accuracy_by_k, 3)))
cat("Best K:", best_k, "\n")

No comments:

Post a Comment