[TOC]
using System;
using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;
namespace CenterSpace.NMath.Stats.Examples.CSharp
{
/// <summary>
/// A .NET example in C# showing how to compute a consensus matrix averaging different MNF clusterings.
/// </summary>
/// <remarks>
/// A Nonnegative Matrix Factorization (NMF) is an approximate factorization
/// of a positive matrix v into a product of two matrices w and h:
/// v ~ wh
/// This factorization can by used to group, or cluster, the columns of v
/// (the columns of v are usually refered to as "samples"). NMF uses an
/// iterative algorithm with random starting values for w and h. This, coupled
/// with the fact that the factorization is not unique, means that if you cluster
/// the columns of v using an NMF cluster several different times, you may get several
/// different clusterings. The NMF consensus matrix is a way to average
/// the possibly different clusterings, and is computed using the following process:
///
/// Cluster the columns of v using NMF n times. Each NMF clustering will yield
/// a "connectivity matrix". The connectivity matrix is a symmetric matrix
/// whose i, jth entry is 1 if columns i and j of v were clustered together,
/// and 0 if they were not. The "consensus matrix" is also a symmetric matrix
/// whose i, jth entry is formed by taking the average of the i, jth entries of
/// the n connectivity matrices.
///
/// It is clear that each i, jth entry of the consensus matrix has a value between 0
/// (columns i and j were not clustered together on any of the n runs) and 1 (columns
/// i and j were clustered together on all n runs). Thus the i, jth entry of a
/// consensus matrix may be considered, in some sense, a "probability" that columns
/// i and j belong to the same cluster.
/// A consensus matrix C may also used to perform a hierarchical clustering of the
/// columns of v by using as the distance function:
///
/// distance between columns i and j = 1.0 - C[i,j]
///
/// This is demonstrated in the example below.
/// </remarks>
class NMFConsensusMatrixExample
{
static void Main(string[] args)
{
Console.WriteLine();
// Read in some data...
string fileName = @"..\..\nmf_data.dat";
DataFrame data = ReadDataFromFile(fileName);
if (data == null) // Problem reading data!
{
return;
}
// Extract the data as a DoubleMatrix.
DoubleMatrix v = data.ToDoubleMatrix();
// Set the order of the NMF (this is the number of columns in w, where
// v ~ wh
int k = 3;
// Set the number of runs or connectivity matrices to use to form the
// consensus matrix.
int numberOfRuns = 70;
// Construct a consensus matrix using the "divergence" update
// algorithm.
NMFConsensusMatrix<NMFDivergenceUpdate> consensusMatrix =
new NMFConsensusMatrix<NMFDivergenceUpdate>(v, data.ColumnHeaders, k, numberOfRuns);
// Print out the number of runs in which the NMF algorithm actually converged to an answer, and the
// resulting consensus matrix.
Console.WriteLine("{0} runs out of {1} converged.", consensusMatrix.NumberOfConvergedRuns, numberOfRuns);
Console.WriteLine();
Console.WriteLine("Consensus Matrix:");
Console.WriteLine(consensusMatrix.ToTabDelimited());
// Let's look at the first column and for each successive column print out the
// "probability" that they are clustered together (we'll use the column
// names from the data frame instead of column numbers).
string label = consensusMatrix.Labels[0];
Console.WriteLine();
for (int j = 1; j < consensusMatrix.Order; j++)
{
Console.WriteLine("The \"probability\" that {0} is clustered with {1} is {2}",
label, consensusMatrix.Labels[j], consensusMatrix[0, j]);
}
// Perform a hierarchical cluster analysis using the consensus matrix
// to define the distance function as described in the class description
// above.
// The cluster analysis class wants to cluster the rows of a matrix. Since we
// are essentially clustering a bunch of column numbers, we'll provide a matrix
// with one column and n rows where n is the number of columns of v (and the
// order of of the consensus matrix). The column will contain the numbers 0
// to n - 1 (basically, we're just clustering the numbers 0,...,n - 1).
DoubleMatrix itemNumbers = new DoubleMatrix(consensusMatrix.Order, 1, 0, 1);
// The distance function object holds the consensus matrix C and returns the distance
// between i and j as 1.0 - C[i,j]
ConsensusMatrixDistance distanceFunctionObject = new ConsensusMatrixDistance(consensusMatrix);
Distance.Function clusterAnalysisDist = new Distance.Function(distanceFunctionObject.CaDistance);
ClusterAnalysis ca = new ClusterAnalysis(itemNumbers, clusterAnalysisDist);
// Form three clusters using the cluster analysis cut tree function and print them out.
ClusterSet clusters = ca.CutTree(3);
Console.WriteLine();
for (int clusterNumber = 0; clusterNumber < clusters.NumberOfClusters; clusterNumber++)
{
int[] members = clusters.Cluster(clusterNumber);
Console.Write("Cluster number {0} contains: ", clusterNumber);
for (int i = 0; i < members.Length; i++)
{
Console.Write("{0} ", consensusMatrix.Labels[members[i]]);
}
Console.WriteLine();
}
Console.WriteLine();
Console.WriteLine("Press Enter Key");
Console.Read();
}
static DataFrame ReadDataFromFile(string fileName)
{
DataFrame data;
try
{
// Load the example data into a DataFrame
data = DataFrame.Load(fileName, true, true, "\t", true);
}
catch (NMathException e)
{
string msg = string.Format("Could not load data file {0}.", fileName);
msg += Environment.NewLine;
msg += e.Message;
msg += Environment.NewLine;
msg += "Data file must have the same name as the example source ";
msg += Environment.NewLine;
msg += "file and be located three directories up from where the ";
msg += Environment.NewLine;
msg += "executable is running.";
Console.WriteLine(msg);
Console.WriteLine();
return null;
}
return data;
}
}
public class ConsensusMatrixDistance
{
private ConnectivityMatrix consensusMatrix;
public ConsensusMatrixDistance(ConnectivityMatrix conn)
{
consensusMatrix = conn;
}
public double CaDistance(DoubleVector data1, DoubleVector data2)
{
int i = (int)data1[0];
int j = (int)data2[0];
return 1.0 - consensusMatrix[i, j];
}
}
}
[TOC]