[TOC]
Imports System
Imports System.Text
Imports CenterSpace.NMath.Core
Imports CenterSpace.NMath.Stats
Namespace CenterSpace.NMath.Stats.Examples.VisualBasic
' A .NET example in VB.NET showing how to compute a consensus matrix averaging different NMF clusterings.
'
' A Nonnegative Matrix Factorization (NMF) is an approximate factorization
' of a positive matrix v into a product of two matrices w and h:
' v ~ wh
' This factorization can be used to group, or cluster, the columns of v
' (the columns of v are usually referred to as "samples"). NMF uses an
' iterative algorithm with random starting values for w and h. This, coupled
' with the fact that the factorization is not unique, means that if you cluster
' the columns of v using an NMF cluster several different times, you may get several
' different clusterings. The NMF consensus matrix is a way to average
' the possibly different clusterings, and is computed using the following process:
'
' Cluster the columns of v using NMF n times. Each NMF clustering will yield
' a "connectivity matrix". The connectivity matrix is a symmetric matrix
' whose i, jth entry is 1 if columns i and j of v were clustered together,
' and 0 if they were not. The "consensus matrix" is also a symmetric matrix
' whose i, jth entry is formed by taking the average of the i, jth entries of
' the n connectivity matrices.
'
' It is clear that each i, jth entry of the consensus matrix has a value between 0
' (columns i and j were not clustered together on any of the n runs) and 1 (columns
' i and j were clustered together on all n runs). Thus the i, jth entry of a
' consensus matrix may be considered, in some sense, a "probability" that columns
' i and j belong to the same cluster.
' A consensus matrix C may also be used to perform a hierarchical clustering of the
' columns of v by using as the distance function:
'
' distance between columns i and j = 1.0 - C[i,j]
'
' This is demonstrated in the example below.
Module NMFConsensusMatrixExample
' Entry point for the example. Loads the sample data, builds an NMF consensus
' matrix from repeated NMF clusterings, prints it, and then performs a
' hierarchical cluster analysis using 1.0 - C[i,j] as the distance between
' columns i and j.
Sub Main()
    Console.WriteLine()

    ' Read in some data...
    ' VB string literals have no backslash escape sequences, so a single
    ' backslash is the directory separator here.
    Dim Filename As String = "..\..\nmf_data.dat"
    Dim Data As DataFrame = ReadDataFromFile(Filename)
    If (Data Is Nothing) Then ' Problem reading data! (already reported to the console)
        Return
    End If

    ' Extract the data as a DoubleMatrix.
    Dim V As DoubleMatrix = Data.ToDoubleMatrix()

    ' Set the order of the NMF (this is the number of columns in w, where
    ' v ~ wh).
    Dim K As Integer = 3

    ' Set the number of runs or connectivity matrices to use to form the
    ' consensus matrix.
    Dim NumberOfRuns As Integer = 70

    ' Construct a consensus matrix using the "divergence" update
    ' algorithm.
    Dim ConsensusMatrix As New NMFConsensusMatrix(Of NMFDivergenceUpdate)(V, Data.ColumnHeaders, K, NumberOfRuns)

    ' Print out the number of runs in which the NMF algorithm actually converged to an answer, and the
    ' resulting consensus matrix.
    Console.WriteLine("{0} runs out of {1} converged.", ConsensusMatrix.NumberOfConvergedRuns, NumberOfRuns)
    Console.WriteLine()
    Console.WriteLine("Consensus Matrix:")
    Console.WriteLine(ConsensusMatrix.ToTabDelimited("F5"))

    ' Let's look at the first column and for each successive column print out the
    ' "probability" that they are clustered together (we'll use the column
    ' names from the data frame instead of column numbers).
    Dim Label As String = ConsensusMatrix.Labels(0)
    Console.WriteLine()
    For J As Integer = 1 To (ConsensusMatrix.Order - 1)
        Console.WriteLine("The ""probability"" that {0} is clustered with {1} is {2}", _
            Label, ConsensusMatrix.Labels(J), ConsensusMatrix(0, J))
    Next

    ' Perform a hierarchical cluster analysis using the consensus matrix
    ' to define the distance function as described in the class description
    ' above.
    ' The cluster analysis class wants to cluster the rows of a matrix. Since we
    ' are essentially clustering a bunch of column numbers, we'll provide a matrix
    ' with one column and n rows where n is the number of columns of v (and the
    ' order of the consensus matrix). The column will contain the numbers 0
    ' to n - 1 (basically, we're just clustering the numbers 0,...,n - 1).
    Dim ItemNumbers As New DoubleMatrix(ConsensusMatrix.Order, 1, 0, 1)

    ' The distance function object holds the consensus matrix C and returns the distance
    ' between i and j as 1.0 - C[i,j]
    Dim DistanceFunctionObject As New ConsensusMatrixDistance(ConsensusMatrix)
    Dim ClusterAnalysisDist As New Distance.Function(AddressOf DistanceFunctionObject.CaDistance)
    Dim CA As New ClusterAnalysis(ItemNumbers, ClusterAnalysisDist)

    ' Form three clusters using the cluster analysis cut tree function and print them out.
    Dim ClusterS As ClusterSet = CA.CutTree(3)
    Console.WriteLine()
    For ClusterNumber As Integer = 0 To (ClusterS.NumberOfClusters - 1)
        ' Members holds the item numbers (column indices of v) in this cluster.
        Dim Members() As Integer = ClusterS.Cluster(ClusterNumber)
        Console.Write("Cluster number {0} contains: ", ClusterNumber)
        For I As Integer = 0 To (Members.Length - 1)
            Console.Write("{0} ", ConsensusMatrix.Labels(Members(I)))
        Next
        Console.WriteLine()
    Next

    Console.WriteLine()
    Console.WriteLine("Press Enter Key")
    Console.Read()
End Sub
' Loads the example data set from FileName into a DataFrame.
'
' Returns the loaded DataFrame, or Nothing when DataFrame.Load raises an
' NMathException; in that case a diagnostic message is written to the console
' first. Other exception types are not caught here.
Function ReadDataFromFile(ByVal FileName As String) As DataFrame
    Try
        ' Load the example data into a DataFrame
        Return DataFrame.Load(FileName, True, True, ControlChars.Tab, True)
    Catch Ex As NMathException
        ' NOTE(review): the text below says "three directories up", while the
        ' caller in this file passes a path two levels up - confirm which
        ' layout is intended before changing either.
        Dim Report As New StringBuilder()
        Report.AppendLine(String.Format("Could not find data file {0}.", FileName))
        Report.AppendLine(Ex.Message)
        Report.AppendLine("Data file must have the same name as the example source ")
        Report.AppendLine("file and be located three directories up from where the ")
        Report.Append("executable is running.")
        Console.WriteLine(Report.ToString())
        Console.WriteLine()
        Return Nothing
    End Try
End Function
End Module
' Distance function object for cluster analysis backed by a connectivity
' (consensus) matrix C: the distance between items i and j is 1.0 - C[i,j].
Class ConsensusMatrixDistance
    ' Matrix whose (i, j) entry is subtracted from 1.0 to form the distance.
    Private ConsensusMatrix As ConnectivityMatrix

    ' Stores the matrix that CaDistance will look distances up in.
    Public Sub New(ByVal Conn As ConnectivityMatrix)
        ConsensusMatrix = Conn
    End Sub

    ' Returns 1.0 - C[i,j], where i and j are taken from the first element of
    ' Data1 and Data2 respectively.
    '
    ' The elements are converted to Integer explicitly with CInt so the code
    ' also compiles under Option Strict On; CInt applies the same rounding as
    ' the implicit conversion it replaces.
    ' NOTE(review): presumably each vector is a one-element row holding an item
    ' number in 0..n-1, as set up by the caller - confirm against the caller.
    Public Function CaDistance(ByVal Data1 As DoubleVector, ByVal Data2 As DoubleVector) As Double
        Dim I As Integer = CInt(Data1(0))
        Dim J As Integer = CInt(Data2(0))
        Return 1.0 - ConsensusMatrix(I, J)
    End Function
End Class
End Namespace
[TOC]