C# PLS2 Cross Validation Example

[TOC]

using System;
using System.IO;

using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;


namespace CenterSpace.NMath.Stats.Examples.CSharp
{
  /// <summary>
  /// This .NET example in C# shows how to use the CrossValidation classes to 
  /// find the optimal number of components for a Partial Least Squares (PLS) 
  /// calculation.
  /// For each number of components we will perform a K-fold cross validation.
  /// In K-fold cross validation the data set is divided into k subsets, and 
  /// the holdout method is repeated k times. Each time, one of the k subsets 
  /// is used as the test set and the other k-1 subsets are put together to 
  /// form a training set. Then the average error across all k trials is computed.
  /// The optimal number of components will then be the number of components for
  /// which this average error is a minimum.
  /// </summary>
  class PLS2CrossValidationExample
  {

    static void Main(string[] args)
    {
      string yDatafilename = "..\\..\\chemometricY.dat";
      string xDatafilename = "..\\..\\chemometricX.dat";
      StreamReader xDataStream;
      StreamReader yDataStream;
      try
      {
        xDataStream = new StreamReader(xDatafilename);
        yDataStream = new StreamReader(yDatafilename);
      }
      catch (FileNotFoundException e)
      {
        string msg = string.Format("Could not find data file {0}, {1}.", xDatafilename, yDatafilename);
        msg += Environment.NewLine;
        msg += e.Message;
        msg += Environment.NewLine;
        Console.WriteLine(msg);
        return;
      }

      DoubleMatrix spectralData = new DoubleMatrix(xDataStream);
      DoubleMatrix concentrationData = new DoubleMatrix(yDataStream);

      int numDependentVars = concentrationData.Cols;
      int numIndependentVars = spectralData.Cols;
      int numSamples = spectralData.Rows;
      int k = 6;

      // The CrossValidation class needs the full set of data, a way to generate
      // subsets of the data and a PLS calculator object. The subset generator 
      // is specified by an instance of the ICrossValidationSubets interface.
      KFoldsSubsets subsetGenerator = new KFoldsSubsets(k);

      // Construct a PLS2 cross validation object that uses SIMPLS algorithm to
      // calculate the partial least squares models.
      PLS2SimplsAlgorithm calculator = new PLS2SimplsAlgorithm();
      PLS2CrossValidation cv = new PLS2CrossValidation(calculator, subsetGenerator);

      // Now for each number of components perform cross validation and record the 
      // minimum average Mean Square Error and the number of components at which
      // it is achieved. 
      int optimalNumComponents = -1;
      double minMse = Double.MaxValue;

      Console.WriteLine();
      Console.WriteLine("Components\tMean Square Error");
      Console.WriteLine("=================================\n\n");

      for (int numComponents = 1; numComponents < numIndependentVars - 1; ++numComponents)
      {
        cv.DoCrossValidation(spectralData, concentrationData, numComponents);
        if (!calculator.IsGood)
        {
          Console.WriteLine("Calcuation with {0} components is not good. Message:", numComponents);
          Console.WriteLine(calculator.Message);
        }
        double mse = cv.AverageMeanSqrError.TwoNorm();
        Console.WriteLine(numComponents + "\t\t" + mse);
        if (mse < minMse)
        {
          minMse = mse;
          optimalNumComponents = numComponents;
        }
      }
      Console.WriteLine("\n\nOptimal number of components = " + optimalNumComponents);
      Console.WriteLine("Minimum MSE = " + minMse);

      Console.WriteLine();
      Console.WriteLine("Press Enter Key");
      Console.Read();
    }
  }
}

[TOC]