// C# NMF Consensus Matrix Example

// ← All NMath Stats Code Examples

 

using System;

using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;
using System.IO;

namespace CenterSpace.NMath.Stats.Examples.CSharp
{
  /// <summary>
  /// A .NET example in C# showing how to compute a consensus matrix averaging different NMF clusterings.
  /// </summary>
  /// <remarks>
  /// A Nonnegative Matrix Factorization (NMF) is an approximate factorization
  /// of a positive matrix v into a product of two matrices w and h:
  /// v ~ wh
  /// This factorization can be used to group, or cluster, the columns of v
  /// (the columns of v are usually referred to as "samples"). NMF uses an
  /// iterative algorithm with random starting values for w and h. This, coupled
  /// with the fact that the factorization is not unique, means that if you cluster
  /// the columns of v using an NMF cluster several different times, you may get several
  /// different clusterings. The NMF consensus matrix is a way to average 
  /// the possibly different clusterings, and is computed using the following process:
  /// 
  /// Cluster the columns of v using NMF n times. Each NMF clustering will yield 
  /// a "connectivity matrix". The connectivity matrix is a symmetric matrix 
  /// whose i, jth entry is 1 if columns i and j of v were clustered together,
  /// and 0 if they were not. The "consensus matrix" is also a symmetric matrix
  /// whose i, jth entry is formed by taking the average of the i, jth entries of
  /// the n connectivity matrices. 
  ///
  /// It is clear that each i, jth entry of the consensus matrix has a value between 0
  /// (columns i and j were not clustered together on any of the n runs) and 1 (columns
  /// i and j were clustered together on all n runs). Thus the i, jth entry of a 
  /// consensus matrix may be considered, in some sense, a "probability" that columns
  /// i and j belong to the same cluster. 
  /// A consensus matrix C may also be used to perform a hierarchical clustering of the 
  /// columns of v by using as the distance function:
  ///
  /// distance between columns i and j = 1.0 - C[i,j]
  ///
  /// This is demonstrated in the example below.
  /// </remarks>
  class NMFConsensusMatrixExample
  {

    /// <summary>
    /// Loads the sample data, builds an NMF consensus matrix from repeated NMF
    /// clusterings of its columns, prints the matrix, and then performs a
    /// hierarchical cluster analysis of the columns using 1.0 - C[i,j] as the
    /// distance between columns i and j.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main( string[] args )
    {
      // Read in some data..
      DataFrame data = DataFrame.Load( "nmf_data.dat", true, true, "\t", true );

      // Extract the data as a DoubleMatrix.
      DoubleMatrix v = data.ToDoubleMatrix();

      // Set the order of the NMF (this is the number of columns in w, where
      // v ~ wh). The same value is used further down as the number of clusters
      // to cut the hierarchical cluster tree into.
      int k = 3;

      // Set the number of runs or connectivity matrices to use to form the 
      // consensus matrix.
      int numberOfRuns = 70;

      // Construct a consensus matrix using the "divergence" update
      // algorithm.
      var consensusMatrix =
        new NMFConsensusMatrix<NMFDivergenceUpdate>( v, data.ColumnHeaders, k, numberOfRuns );

      Console.WriteLine();

      // Print out the number of runs in which the NMF algorithm actually converged to an answer, and the 
      // resulting consensus matrix.
      Console.WriteLine( "{0} runs out of {1} converged.", consensusMatrix.NumberOfConvergedRuns, numberOfRuns );
      Console.WriteLine();
      Console.WriteLine( "Consensus Matrix:" );
      Console.WriteLine( consensusMatrix.ToTabDelimited( "G3" ) );

      // Let's look at the first column and for each successive column print out the 
      // "probability" that they are clustered together (we'll use the column
      // names from the data frame instead of column numbers).
      string label = consensusMatrix.Labels[0];
      Console.WriteLine();
      for ( int j = 1; j < consensusMatrix.Order; j++ )
      {
        Console.WriteLine( "The \"probability\" that {0} is clustered with {1} is {2}",
          label, consensusMatrix.Labels[j], consensusMatrix[0, j] );
      }

      // Perform a hierarchical cluster analysis using the consensus matrix 
      // to define the distance function: the distance between columns i and j
      // is 1.0 - C[i,j], where C is the consensus matrix.

      // The cluster analysis class wants to cluster the rows of a matrix. Since we 
      // are essentially clustering a bunch of column numbers, we'll provide a matrix
      // with one column and n rows where n is the number of columns of v (and the
      // order of the consensus matrix). The column will contain the numbers 0
      // to n - 1 (basically, we're just clustering the numbers 0,...,n - 1).
      var itemNumbers = new DoubleMatrix( consensusMatrix.Order, 1, 0, 1 );

      // The distance function object holds the consensus matrix C and returns the distance
      // between i and j as 1.0 - C[i,j]
      var distanceFunctionObject = new ConsensusMatrixDistance( consensusMatrix );
      var clusterAnalysisDist = new Distance.Function( distanceFunctionObject.CaDistance );
      var ca = new ClusterAnalysis( itemNumbers, clusterAnalysisDist );

      // Form k clusters using the cluster analysis cut tree function and print them out.
      // Using k here (rather than repeating the literal) keeps the hierarchical cut
      // in step with the NMF order chosen above.
      ClusterSet clusters = ca.CutTree( k );
      Console.WriteLine();
      for ( int clusterNumber = 0; clusterNumber < clusters.NumberOfClusters; clusterNumber++ )
      {
        int[] members = clusters.Cluster( clusterNumber );
        Console.Write( "Cluster number {0} contains: ", clusterNumber );
        foreach ( int member in members )
        {
          // Each member is an item number, i.e. an index into the consensus matrix labels.
          Console.Write( "{0} ", consensusMatrix.Labels[member] );
        }
        Console.WriteLine();
      }

      Console.WriteLine();
      Console.WriteLine( "Press Enter Key" );
      Console.Read();
    }
  }

  /// <summary>
  /// Distance function holder that wraps a connectivity (consensus) matrix C and
  /// reports the distance between items i and j as 1.0 - C[i,j].
  /// </summary>
  public class ConsensusMatrixDistance
  {
    // Assigned once in the constructor and never replaced afterwards.
    private readonly ConnectivityMatrix consensusMatrix;

    /// <summary>
    /// Creates a distance object backed by the given matrix.
    /// </summary>
    /// <param name="conn">The matrix whose entries define the distances.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown if <paramref name="conn"/> is null.
    /// </exception>
    public ConsensusMatrixDistance( ConnectivityMatrix conn )
    {
      // Fail fast here rather than with a NullReferenceException on the first
      // distance evaluation.
      if ( conn == null )
      {
        throw new ArgumentNullException( "conn" );
      }

      consensusMatrix = conn;
    }

    /// <summary>
    /// Computes the distance between two items. The first element of each vector
    /// is interpreted as an item index into the wrapped matrix.
    /// </summary>
    /// <param name="data1">Vector whose first element is the index i.</param>
    /// <param name="data2">Vector whose first element is the index j.</param>
    /// <returns>1.0 - C[i,j], where C is the wrapped matrix.</returns>
    public double CaDistance( DoubleVector data1, DoubleVector data2 )
    {
      // The indices are stored as doubles in the vectors; convert them back to
      // integer indices.
      int i = (int) data1[0];
      int j = (int) data2[0];
      return 1.0 - consensusMatrix[i, j];
    }
  }
}

// ← All NMath Stats Code Examples
// Top