mctune: multicore hyperparameter tuning in R, using SVM car detection as an example

April 20, 2014

mctune

In machine learning (ML) tasks, finding good hyperparameters for a model is critical (hyperparameter optimization). In R, several packages contain routines that do this for you via grid search, i.e. constructing and testing all possible parameter combinations as a grid (e.g. David Meyer’s e1071 package).
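
To illustrate: a grid search simply builds the Cartesian product of all candidate values and evaluates the model on each combination. This is what expand.grid does, and what tune (and mctune below) use internally to build the grid; the parameter values here are just placeholders:

# build a parameter grid for an RBF SVM (placeholder values)
params <- expand.grid(gamma = 3^(-10:-4), cost = 3^(-8:8))
nrow(params)  # 7 * 17 = 119 parameter combinations to evaluate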

As good as the routines in those packages are, a while ago I wanted multicore hyperparameter tuning and needed the true positive rate (true match rate) and true negative rate (true non-match rate) alongside the overall error rate. I therefore modified the tune function of the e1071 package to a) use multicore processing and b) return confusion matrices showing the average true positive / true negative values per sample class and parameter setting.

The script inherits its GPL-2 licence from the original version. I modified tune specifically for my own needs when tuning SVM hyperparameters, so I did not validate its correctness or how it works with other tune-able functions (e.g. KNN, random forest).

# modified version of "tune" from package "e1071"
# licence: GPL-2
# details: http://cran.r-project.org/web/packages/e1071/index.html
mctune <- function(method, train.x, train.y = NULL, data = list(),
                 validation.x = NULL, validation.y = NULL,
                 ranges = NULL, predict.func = predict,
                 tunecontrol = tune.control(),
                 mc.control=NULL,
                 confusionmatrizes=F,
                 ...
) {
  call <- match.call()

  require('plyr')
  require('parallel')

  ## internal helper functions
  resp <- function(formula, data) {
    model.response(model.frame(formula, data))
  }

  classAgreement2 <- function (tab) {
    n <- sum(tab)
    # correct classification rate
    if (!is.null(dimnames(tab))) {
      lev <- intersect(colnames(tab), rownames(tab))
      d <- diag(tab[lev, lev])
      p0 <- sum(d) / n
    } else {
      m <- min(dim(tab))
      d <- diag(tab[1:m, 1:m])
      p0 <- sum(d) / n
    }
    # confusion matrices
    if(!confusionmatrizes) {
      list(p0=p0)
    } else if(is.null(dimnames(tab))) {
      stop('tables without dimension names are not allowed when generating confusionmatrizes.')
    }
    else {
      # generate confusion matrix for each class
      classnames <- unique(unlist(dimnames(tab)))
      truepositives <-  unlist(lapply(classnames, function(positiveclassname) { sum(d[positiveclassname])}))
      falsepositives <- unlist(lapply(classnames, function(positiveclassname) { sum(tab[positiveclassname,])-tab[positiveclassname,positiveclassname]}))
      falsenegatives <- unlist(lapply(classnames, function(positiveclassname) { sum(tab[,positiveclassname])-tab[positiveclassname,positiveclassname]}))
      truenegatives <- mapply(FUN=function(tp,fn,fp){ sum(tab)-tp-fn-fp }, truepositives, falsenegatives, falsepositives)
      confusions <- data.frame(classnames, truepositives, truenegatives, falsepositives, falsenegatives, row.names=NULL)
      colnames(confusions) <- c('class', 'tp', 'tn', 'fp', 'fn')
      list(p0=p0, confusions=confusions)
    }
  }

  ## parameter handling
  if (tunecontrol$sampling == "cross")
    validation.x <- validation.y <- NULL
  useFormula <- is.null(train.y)
  if (useFormula && (is.null(data) || length(data) == 0))
    data <- model.frame(train.x)
  if (is.vector(train.x)) train.x <- t(t(train.x))
  if (is.data.frame(train.y))
    train.y <- as.matrix(train.y)

  ## prepare training indices
  if (!is.null(validation.x)) tunecontrol$fix <- 1
  n <- (nrow(if (useFormula) data
            else train.x))
  perm.ind <- sample(n)
  if (tunecontrol$sampling == "cross") {
    if (tunecontrol$cross > n)
      stop(sQuote("cross"), " must not exceed sampling size!")
    if (tunecontrol$cross == 1)
      stop(sQuote("cross"), " must be greater than 1!")
  }
  train.ind <- (if (tunecontrol$sampling == "cross")
                  tapply(1:n, cut(1:n, breaks = tunecontrol$cross), function(x) perm.ind[-x])
                else if (tunecontrol$sampling == "fix")
                  list(perm.ind[1:trunc(n * tunecontrol$fix)])
                else
                ## bootstrap
                  lapply(1:tunecontrol$nboot,
                         function(x) sample(n, n * tunecontrol$boot.size, replace = TRUE))
  )

  ## find best model
  parameters <- (if(is.null(ranges))
                  data.frame(dummyparameter = 0)
                else
                  expand.grid(ranges))
  p <- nrow(parameters)
  if (!is.logical(tunecontrol$random)) {
    if (tunecontrol$random < 1)
      stop("random must be a strictly positive integer")
    if (tunecontrol$random > p) tunecontrol$random <- p
    parameters <- parameters[sample(1:p, tunecontrol$random),]
  }

  ## - loop over all models
  # concatenate arbitrary mc-arguments with explicit X and FUN arguments
  train_results<-do.call(what="mclapply", args=c(mc.control, list(X=1:p, ..., FUN=function(para.set) {
    sampling.errors <- c()
    sampling.confusions <- c()

    ## - loop over all training samples
    for (sample in 1:length(train.ind)) {
      repeat.errors <- c()
      repeat.confusions <- c()

      ## - repeat training `nrepeat' times
      for (reps in 1:tunecontrol$nrepeat) {

        ## train one model
        pars <- if (is.null(ranges))
          NULL
        else
          lapply(parameters[para.set,,drop = FALSE], unlist)

        model <- if (useFormula)
          do.call(method, c(list(train.x,
                                 data = data,
                                 subset = train.ind[[sample]]),
                            pars, list(...)
          )
          )
        else
          do.call(method, c(list(train.x[train.ind[[sample]],],
                                 y = train.y[train.ind[[sample]]]),
                            pars, list(...)
          )
          )

        ## predict validation set
        pred <- predict.func(model,
                             if (!is.null(validation.x))
                               validation.x
                             else if (useFormula)
                               data[-train.ind[[sample]],,drop = FALSE]
                             else if (inherits(train.x, "matrix.csr"))
                               train.x[-train.ind[[sample]],]
                             else
                               train.x[-train.ind[[sample]],,drop = FALSE]
        )

        ## compute performance measure
        true.y <- if (!is.null(validation.y))
          validation.y
        else if (useFormula) {
          if (!is.null(validation.x))
            resp(train.x, validation.x)
          else
            resp(train.x, data[-train.ind[[sample]],])
        } else
          train.y[-train.ind[[sample]]]

        if (is.null(true.y)) true.y <- rep(TRUE, length(pred))

        if (!is.null(tunecontrol$error.fun))
          repeat.errors[reps] <- tunecontrol$error.fun(true.y, pred)
        else if ((is.logical(true.y) || is.factor(true.y)) && (is.logical(pred) || is.factor(pred) || is.character(pred))) { ## classification error
          l <- classAgreement2(table(pred, true.y))
          repeat.errors[reps] <- (1 - l$p0) # wrong classification rate
          if(confusionmatrizes) {
            repeat.confusions[[reps]] <- l$confusions
          }
        } else if (is.numeric(true.y) && is.numeric(pred)) ## mean squared error
          repeat.errors[reps] <- crossprod(pred - true.y) / length(pred)
        else
          stop("Dependent variable has wrong type!")
      }
      sampling.errors[sample] <- tunecontrol$repeat.aggregate(repeat.errors)
      # TODO potentially implement separate aggregation of tp tn fp fn values. currently the values corresponding to the smallest error are taken.
      if(confusionmatrizes) {
        sampling.confusions[[sample]] <- repeat.confusions[repeat.errors == sampling.errors[sample]][[1]]
      }
    }
    # TODO potentially implement separate aggregation of tp tn fp fn values. currently uses the same as for error / variance aggregation
    if(!confusionmatrizes) {
      list(
        model.error=tunecontrol$sampling.aggregate(sampling.errors),
        model.variance=tunecontrol$sampling.dispersion(sampling.errors))
    } else {
      # create one confusion data frame
      confusions <- ldply(sampling.confusions, data.frame)
      # calculate aggregate / disperse values per class
      confusions <- ldply(lapply(X=split(confusions, confusions$class), FUN=function(classdf) {
        class=unique(classdf$class)
        # only take numeric values
        classdf[,c('tp','tn','fp','fn')]
        # calculate aggregate / disperse values for this class
        aggregated <- apply(X=classdf[,c('tp','tn','fp','fn')], MAR=2, FUN=tunecontrol$sampling.aggregate)
        dispersions <- apply(X=classdf[,c('tp','tn','fp','fn')], MAR=2, FUN=tunecontrol$sampling.dispersion)
        # make 1 row dataframe out of it (combine rows later with outer ldply)
        t(data.frame(c(value=aggregated,dispersion=dispersions)))
      }), data.frame)
      colnames(confusions) <- c('class', 'tp.value', 'tn.value', 'fp.value', 'fn.value', 'tp.dispersion', 'tn.dispersion', 'fp.dispersion', 'fn.dispersion')
      # calculate mean confusion matrix values (mean of all classes) for best model
      confusions.mean <- data.frame(t(apply(X=confusions[,c('tp.value','tn.value','fp.value','fn.value','tp.dispersion','tn.dispersion','fp.dispersion','fn.dispersion')], MAR=2, FUN=mean)))
      colnames(confusions.mean) <- c('tp.value', 'tn.value', 'fp.value', 'fn.value', 'tp.dispersion', 'tn.dispersion', 'fp.dispersion', 'fn.dispersion')
      list(
        model.error=tunecontrol$sampling.aggregate(sampling.errors),
        model.variance=tunecontrol$sampling.dispersion(sampling.errors),
        model.confusions=confusions,
        model.confusions.mean=confusions.mean
      )
    }
  })))
#   print('mctune: mclapply done.')
#   print(train_results)
  model.errors <- unlist(lapply(train_results,function(x)x$model.error))
  model.variances <- unlist(lapply(train_results,function(x)x$model.variance))
  if(confusionmatrizes){
    model.confusions <- lapply(train_results,function(x)x$model.confusions)
    model.confusions.mean <- ldply(lapply(train_results,function(x)x$model.confusions.mean), data.frame)
  }

  ## return results
  best <- which.min(model.errors)
  pars <- if (is.null(ranges))
    NULL
  else
    lapply(parameters[best,,drop = FALSE], unlist)
  structure(list(best.parameters  = parameters[best,,drop = FALSE],
                 best.performance = model.errors[best],
                 method           = if (!is.character(method))
                   deparse(substitute(method)) else method,
                 nparcomb         = nrow(parameters),
                 train.ind        = train.ind,
                 sampling         = switch(tunecontrol$sampling,
                                           fix = "fixed training/validation set",
                                           bootstrap = "bootstrapping",
                                           cross = if (tunecontrol$cross == n) "leave-one-out" else
                                             paste(tunecontrol$cross,"-fold cross validation", sep="")
                 ),
                 performances     = if (tunecontrol$performances) cbind(parameters, error = model.errors, dispersion = model.variances),
                 confusionmatrizes = if (confusionmatrizes) model.confusions,
                 confusionmatrizes.mean = if(confusionmatrizes) model.confusions.mean,
                 best.confusionmatrizes = if(confusionmatrizes) model.confusions[[best]],
                 best.confusionmatrizes.mean = if(confusionmatrizes) model.confusions.mean[best,],
                 best.model       = if (tunecontrol$best.model) {
                   modeltmp <- if (useFormula)
                     do.call(method, c(list(train.x, data = data),
                                       pars, list(...)))
                   else
                     do.call(method, c(list(x = train.x,
                                            y = train.y),
                                       pars, list(...)))
                   call[[1]] <- as.symbol("best.tune")
                   modeltmp$call <- call
                   modeltmp
                 }
  ),
            class = "tune"
  )
}

Additional parameters

mc.control

A list of parameters passed on to mclapply from the parallel package. Example: mc.control=list(mc.cores=3, mc.preschedule=F)
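
These options are simply spliced into the internal mclapply call, so anything mclapply accepts can go here. For reference, a minimal standalone sketch with the same options:

library('parallel')
# run a toy function over 1:6 on 3 cores, without prescheduling
mclapply(1:6, function(i) i^2, mc.cores=3, mc.preschedule=FALSE)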

confusionmatrizes

Logical flag indicating whether confusion matrices should be generated.

Additional return values

tune$confusionmatrizes

List of confusion matrices. The matrices are sorted the same way as tune$performances. Each matrix is a data frame listing the sample classes (class) and their corresponding absolute average values (value) and standard deviations (dispersion) for true positives (TP), true negatives (TN), false positives (FP) and false negatives (FN). An example of such a confusion matrix from the car detection below could look like this (where “yes” means “car” and “no” means “no car”):

class tp.value tn.value fp.value fn.value tp.dispersion tn.dispersion fp.dispersion fn.dispersion
   no     49.5     53.4      1.6      0.5      5.212165      5.621388     0.8432740     0.5270463
  yes     53.4     49.5      0.5      1.6      5.621388      5.212165     0.5270463     0.8432740

In the above example, tp.value=53.4 for the “yes” class means that on average 53.4 samples were correctly classified as “yes” with the corresponding parameter setting.
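
If you want per-class rates instead of absolute counts, they can be derived directly from such a row; a small sketch using the values above (the variable cm is just for illustration):

# one row of the confusion data frame shown above ("yes" class)
cm <- data.frame(class='yes', tp.value=53.4, tn.value=49.5, fp.value=0.5, fn.value=1.6)
tpr <- cm$tp.value / (cm$tp.value + cm$fn.value)  # true positive rate, ~0.97
tnr <- cm$tn.value / (cm$tn.value + cm$fp.value)  # true negative rate, ~0.99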

Car detection

As ML example we do car detection. The task is to decide whether a given image contains a car or not (leaving aside search mechanisms such as sliding windows). As data source we use the UIUC Image Database for Car Detection. In order to reduce the number of features (pixels) and to “blur” the images (which discards unnecessary details and likely even improves recognition), I resized the images from the original 100×40 to 50×20 pixels. For a human it is still easy at this scale to decide whether an image shows a car or not.
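
The resizing itself is not part of the script below; a minimal sketch of how it could be done with plain pixel subsampling (assuming grayscale PNGs, where readPNG returns a numeric matrix; the file names are just examples):

library('png')
img <- readPNG('pos-0.png')  # 40 x 100 matrix (rows x columns)
# keep every second row and column -> 20 x 50
img_small <- img[seq(1, nrow(img), by=2), seq(1, ncol(img), by=2)]
writePNG(img_small, 'pos-0_small.png')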

[Example images at 50×20 pixels: No car 1, No car 2, No car 3, Car 1, Car 2, Car 3]

As ML model we use a support vector machine with a radial (Gaussian) kernel, which leaves us with the cost and gamma parameters to be tuned – which we do using a grid search and 10-fold cross validation.

# load data
print('loading data...')
library('png')
car_neg <- lapply(dir(pattern="neg.*png"),readPNG)
car_pos <- lapply(dir(pattern="pos.*png"),readPNG)
data <- c(car_neg,car_pos)
labels <- factor(c(rep('no',length(car_neg)), rep('yes', length(car_pos))))
# put data to dataframe
data <- t(array(unlist(data),dim=c(length(data[[1]]),length(data))))
data<-data.frame(data)
# look at a car
# image(array(as.numeric(data[600,]),dim=c(20,50)))
# look at correlation between first pixels
# plot(data.frame(data,labels)[,1:8],pch='.') 

library('e1071')
source('mctune.R')
t<-mctune(confusionmatrizes=T,
          mc.control=list(mc.cores=3, mc.preschedule=F),
          method=svm,
          ranges=list(type='C',
                      kernel='radial',
                      gamma=3^(-10:-4),
                      cost=3^(-8:8)),
          train.x=data,
          train.y=labels,
          validation.x=NULL, #validation.x and .y are only used with tune.control sampling='boot' and 'fixed'.
          validation.y=NULL,
          tunecontrol=tune.control(sampling='cross',
                                   cross=10,
                                   performances=T,
                                   nrepeat=1,
                                   best.model=T))
# extract FMR FNMR from our positive class
p <- lapply(X=t$confusionmatrizes, FUN=function(x){
  p <- x[x$class=='yes',]
  p[-1]
})
p <- ldply(p)
t$performances <- data.frame(t$performances, p)
t$performances$FMR <- t$performances$fp.value / (t$performances$tn.value + t$performances$fp.value)
t$performances$FNMR <- t$performances$fn.value / (t$performances$tp.value + t$performances$fn.value)
# print list of errors
t$performances[with(t$performances, order(error)),]

# different plots of parameters and errors
library('scatterplot3d')
scatterplot3d(t$performances$cost, t$performances$gamma, t$performances$error,log='xy', type='h', pch=1, color='red')
plot(t$performances[,3:5],log='xy',pch=4)
# parameters and errors: from best to worst
plot(t$performances[with(t$performances, order(error)),]$cost,log='y', col='blue')
points(t$performances[with(t$performances, order(error)),]$gamma, col='green')
points(t$performances[with(t$performances, order(error)),]$error, col='black')
# points(t$performances[with(t$performances, order(error)),]$FMR, col='red')
# points(t$performances[with(t$performances, order(error)),]$FNMR, col='orange')

Using this setup, the first search yields an error rate of about 0.02 for gamma = 10^-3 and cost >= 3 (which indicates the search area for the next step). The corresponding false match rate (FMR) and false non-match rate (FNMR) are in the range of 0.01 to 0.029.
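
A follow-up run could then zoom into that region with a finer grid; a possible sketch reusing the call from above (the exact ranges are of course a matter of taste):

t2 <- mctune(confusionmatrizes=T,
             mc.control=list(mc.cores=3, mc.preschedule=F),
             method=svm,
             ranges=list(type='C',
                         kernel='radial',
                         gamma=10^seq(-3.5, -2.5, by=0.25),
                         cost=3^(1:8)),
             train.x=data,
             train.y=labels,
             tunecontrol=tune.control(sampling='cross', cross=10,
                                      performances=T, nrepeat=1, best.model=T))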

Image classification using SVMs in R

February 24, 2013

Recently I did some Support Vector Machine (SVM) tests in R (a statistical language with functional parts for rapid prototyping and data analysis — somewhat similar to Matlab, but open source ;)) for my current face recognition projects. To get my SVMs up and running in R, using image data as input and output, I wrote a small demo script for classifying images. As test data I used 2 classes of images (lines from top left to bottom right and lines from bottom left to top right), with 10 samples each — like these:

[Example images of the two classes]
The complete image set is available here.

For SVM classification, simple train and test sets are used here — for more sophisticated problems, n-fold cross validation to search for good parameter settings is recommended instead. For everybody who has not yet worked with SVMs, I’d recommend reading something about how to start with “good” SVM classification, like the pretty short and easy to read “A Practical Guide to Support Vector Classification” by Chih-Wei Hsu, Chih-Chung Chang, and Chih-Jen Lin (the LIBSVM inventors).
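
If you do want such a parameter search with cross validation right away, e1071 already ships tune wrappers; a minimal sketch (assuming x is a numeric feature matrix and y a factor of class labels; the grid values are just placeholders):

library('e1071')
# 10-fold cross validation over a small cost/gamma grid
tuned <- tune.svm(x, y, kernel='radial', gamma=10^(-3:-1), cost=10^(0:2),
                  tunecontrol=tune.control(sampling='cross', cross=10))
summary(tuned)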

Update: added parallel processing using parallel and mclapply for loading the image data (for demonstration purposes only, loading 10 images in parallel does not make a big difference ;)).

print('starting svm demo...')

library('png')
library('e1071')
library('parallel')

# load img data
folder<-'.'
file_list <- dir(folder, pattern="png")
data <- mclapply(file_list, readPNG, mc.cores=2)
# extract subject id + img nr from names
subject_ids <- lapply(file_list, function(file_name) as.numeric(unlist(strsplit(file_name, "_"))[1]))
# rename subject ids to c1 and c2 for clearer display of results
subject_ids[subject_ids==0]='c1'
subject_ids[subject_ids!='c1']='c2'
img_ids <- lapply(file_list, function(file_name) as.numeric(unlist(strsplit(unlist(strsplit(file_name, "_"))[2], "\\."))[1]))

# specify which data should be used as test and train by the img nrs
train_test_border <- 7
# split data into train and test, and bring into array form to feed to svm
train_in <- t(array(unlist(data[img_ids < train_test_border]), dim=c(length(unlist(data[1])),sum(img_ids < train_test_border))))
train_out <- unlist(subject_ids[img_ids < train_test_border])
test_in <- t(array(unlist(data[img_ids >= train_test_border]), dim=c(length(unlist(data[1])),sum(img_ids >= train_test_border))))
test_out <- unlist(subject_ids[img_ids >= train_test_border])

# train svm - try out different kernels + settings here
svm_model <- svm(train_in, train_out, type='C', kernel='linear')

# evaluate svm
p <- predict(svm_model, train_in)
print(p)
print(table(p, train_out))
p <- predict(svm_model, test_in)
print(p)
print(table(p, test_out))

print('svm demo done!')