C# NMF Consensus Matrix Example

[TOC]

using System;

using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;

namespace CenterSpace.NMath.Stats.Examples.CSharp
{
  /// <summary>
  /// A .NET example in C# showing how to compute a consensus matrix averaging different MNF clusterings.
  /// </summary>
  /// <remarks>
  /// A Nonnegative Matrix Factorization (NMF) is an approximate factorization
  /// of a positive matrix v into a product of two matrices w and h:
  /// v ~ wh
  /// This factorization can by used to group, or cluster, the columns of v
  /// (the columns of v are usually refered to as "samples"). NMF uses an
  /// iterative algorithm with random starting values for w and h. This, coupled
  /// with the fact that the factorization is not unique, means that if you cluster
  /// the columns of v using an NMF cluster several different times, you may get several
  /// different clusterings. The NMF consensus matrix is a way to average 
  /// the possibly different clusterings, and is computed using the following process:
  /// 
  /// Cluster the columns of v using NMF n times. Each NMF clustering will yield 
  /// a "connectivity matrix". The connectivity matrix is a symmetric matrix 
  /// whose i, jth entry is 1 if columns i and j of v were clustered together,
  /// and 0 if they were not. The "consensus matrix" is also a symmetric matrix
  /// whose i, jth entry is formed by taking the average of the i, jth entries of
  /// the n connectivity matrices. 
  ///
  /// It is clear that each i, jth entry of the consensus matrix has a value between 0
  /// (columns i and j were not clustered together on any of the n runs) and 1 (columns
  /// i and j were clustered together on all n runs). Thus the i, jth entry of a 
  /// consensus matrix may be considered, in some sense, a "probability" that columns
  /// i and j belong to the same cluster. 
  /// A consensus matrix C may also used to perform a hierarchical clustering of the 
  /// columns of v by using as the distance function:
  ///
  /// distance between columns i and j = 1.0 - C[i,j]
  ///
  /// This is demonstrated in the example below.
  /// </remarks>
  class NMFConsensusMatrixExample
  {

    static void Main(string[] args)
    {
      Console.WriteLine();

      // Read in some data...
      string fileName = @"..\..\nmf_data.dat";
      DataFrame data = ReadDataFromFile(fileName);
      if (data == null) // Problem reading data!
      {
        return;
      }

      // Extract the data as a DoubleMatrix.
      DoubleMatrix v = data.ToDoubleMatrix();

      // Set the order of the NMF (this is the number of columns in w, where
      // v ~ wh
      int k = 3;

      // Set the number of runs or connectivity matrices to use to form the 
      // consensus matrix.
      int numberOfRuns = 70;

      // Construct a consensus matrix using the "divergence" update
      // algorithm.
      NMFConsensusMatrix<NMFDivergenceUpdate> consensusMatrix =
        new NMFConsensusMatrix<NMFDivergenceUpdate>(v, data.ColumnHeaders, k, numberOfRuns);

      // Print out the number of runs in which the NMF algorithm actually converged to an answer, and the 
      // resulting consensus matrix.
      Console.WriteLine("{0} runs out of {1} converged.", consensusMatrix.NumberOfConvergedRuns, numberOfRuns);
      Console.WriteLine();
      Console.WriteLine("Consensus Matrix:");
      Console.WriteLine(consensusMatrix.ToTabDelimited());

      // Let's look at the first column and for each successive column print out the 
      // "probability" that they are clustered together (we'll use the column
      // names from the data frame instead of column numbers).
      string label = consensusMatrix.Labels[0];
      Console.WriteLine();
      for (int j = 1; j < consensusMatrix.Order; j++)
      {
        Console.WriteLine("The \"probability\" that {0} is clustered with {1} is {2}",
          label, consensusMatrix.Labels[j], consensusMatrix[0, j]);
      }

      // Perform a hierarchical cluster analysis using the consensus matrix 
      // to define the distance function as described in the class description
      // above.

      // The cluster analysis class wants to cluster the rows of a matrix. Since we 
      // are essentially clustering a bunch of column numbers, we'll provide a matrix
      // with one column and n rows where n is the number of columns of v (and the
      // order of of the consensus matrix). The column will contain the numbers 0
      // to n - 1 (basically, we're just clustering the numbers 0,...,n - 1).
      DoubleMatrix itemNumbers = new DoubleMatrix(consensusMatrix.Order, 1, 0, 1);

      // The distance function object holds the consensus matrix C and returns the distance
      // between i and j as 1.0 - C[i,j]
      ConsensusMatrixDistance distanceFunctionObject = new ConsensusMatrixDistance(consensusMatrix);
      Distance.Function clusterAnalysisDist = new Distance.Function(distanceFunctionObject.CaDistance);
      ClusterAnalysis ca = new ClusterAnalysis(itemNumbers, clusterAnalysisDist);

      // Form three clusters using the cluster analysis cut tree function and print them out.
      ClusterSet clusters = ca.CutTree(3);
      Console.WriteLine();
      for (int clusterNumber = 0; clusterNumber < clusters.NumberOfClusters; clusterNumber++)
      {
        int[] members = clusters.Cluster(clusterNumber);
        Console.Write("Cluster number {0} contains: ", clusterNumber);
        for (int i = 0; i < members.Length; i++)
        {
          Console.Write("{0} ", consensusMatrix.Labels[members[i]]);
        }
        Console.WriteLine();
      }

      Console.WriteLine();
      Console.WriteLine("Press Enter Key");
      Console.Read();
    }

    static DataFrame ReadDataFromFile(string fileName)
    {
      DataFrame data;
      try
      {
        // Load the example data into a DataFrame
        data = DataFrame.Load(fileName, true, true, "\t", true);
      }
      catch (NMathException e)
      {
        string msg = string.Format("Could not load data file {0}.", fileName);
        msg += Environment.NewLine;
        msg += e.Message;
        msg += Environment.NewLine;
        msg += "Data file must have the same name as the example source ";
        msg += Environment.NewLine;
        msg += "file and be located three directories up from where the ";
        msg += Environment.NewLine;
        msg += "executable is running.";
        Console.WriteLine(msg);
        Console.WriteLine();
        return null;
      }
      return data;
    }
  }

  public class ConsensusMatrixDistance
  {
    private ConnectivityMatrix consensusMatrix;

    public ConsensusMatrixDistance(ConnectivityMatrix conn)
    {
      consensusMatrix = conn;
    }

    public double CaDistance(DoubleVector data1, DoubleVector data2)
    {
      int i = (int)data1[0];
      int j = (int)data2[0];
      return 1.0 - consensusMatrix[i, j];
    }
  }
}

[TOC]