VB.NET NMF Consensus Matrix Example

[TOC]

Imports System
Imports System.Text

Imports CenterSpace.NMath.Core
Imports CenterSpace.NMath.Stats

Namespace CenterSpace.NMath.Stats.Examples.VisualBasic

  ' A .NET example in VB.NET showing how to compute a consensus matrix averaging different NMF clusterings.
  '
  ' A Nonnegative Matrix Factorization (NMF) is an approximate factorization
  ' of a positive matrix v into a product of two matrices w and h:
  ' v ~ wh
  ' This factorization can be used to group, or cluster, the columns of v
  ' (the columns of v are usually referred to as "samples"). NMF uses an
  ' iterative algorithm with random starting values for w and h. This, coupled
  ' with the fact that the factorization is not unique, means that if you cluster
  ' the columns of v using an NMF cluster several different times, you may get several
  ' different clusterings. The NMF consensus matrix is a way to average 
  ' the possibly different clusterings, and is computed using the following process:
  ' 
  ' Cluster the columns of v using NMF n times. Each NMF clustering will yield 
  ' a "connectivity matrix". The connectivity matrix is a symmetric matrix 
  ' whose i, jth entry is 1 if columns i and j of v were clustered together,
  ' and 0 if they were not. The "consensus matrix" is also a symmetric matrix
  ' whose i, jth entry is formed by taking the average of the i, jth entries of
  ' the n connectivity matrices. 
  '
  ' It is clear that each i, jth entry of the consensus matrix has a value between 0
  ' (columns i and j were not clustered together on any of the n runs) and 1 (columns
  ' i and j were clustered together on all n runs). Thus the i, jth entry of a 
  ' consensus matrix may be considered, in some sense, a "probability" that columns
  ' i and j belong to the same cluster. 
  ' A consensus matrix C may also be used to perform a hierarchical clustering of the
  ' columns of v by using as the distance function:
  '
  ' distance between columns i and j = 1.0 - C[i,j]
  '
  ' This is demonstrated in the example below.

  ' Example driver: builds an NMF consensus matrix from a tab-delimited data
  ' file, prints it, and then uses it as a similarity measure for a
  ' hierarchical cluster analysis of the data columns.
  Module NMFConsensusMatrixExample

    ' Program entry point. Loads the example data, computes the consensus
    ' matrix, prints the pairwise "probabilities" for the first column, and
    ' finally prints the members of three hierarchical clusters.
    Sub Main()

      Console.WriteLine()

      ' Read in some data...
      ' Note: VB string literals have no backslash escapes, so the path is
      ' written with single separators.
      Dim Filename As String = "..\..\nmf_data.dat"
      Dim Data As DataFrame = ReadDataFromFile(Filename)
      If Data Is Nothing Then
        ' Problem reading data! ReadDataFromFile has already reported it.
        Return
      End If

      ' Extract the data as a DoubleMatrix.
      Dim V As DoubleMatrix = Data.ToDoubleMatrix()

      ' Set the order of the NMF (this is the number of columns in w, where
      ' v ~ wh).
      Dim K As Integer = 3

      ' Set the number of runs or connectivity matrices to use to form the
      ' consensus matrix.
      Dim NumberOfRuns As Integer = 70

      ' Construct a consensus matrix using the "divergence" update
      ' algorithm.
      Dim ConsensusMatrix As New NMFConsensusMatrix(Of NMFDivergenceUpdate)(V, Data.ColumnHeaders, K, NumberOfRuns)

      ' Print out the number of runs in which the NMF algorithm actually
      ' converged to an answer, and the resulting consensus matrix.
      Console.WriteLine("{0} runs out of {1} converged.", ConsensusMatrix.NumberOfConvergedRuns, NumberOfRuns)
      Console.WriteLine()
      Console.WriteLine("Consensus Matrix:")
      Console.WriteLine(ConsensusMatrix.ToTabDelimited("F5"))

      ' Let's look at the first column and for each successive column print out the
      ' "probability" that they are clustered together (we'll use the column
      ' names from the data frame instead of column numbers).
      Dim Label As String = ConsensusMatrix.Labels(0)
      Console.WriteLine()

      For J As Integer = 1 To (ConsensusMatrix.Order - 1)
        Console.WriteLine("The ""probability"" that {0} is clustered with {1} is {2}", _
          Label, ConsensusMatrix.Labels(J), ConsensusMatrix(0, J))
      Next

      ' Perform a hierarchical cluster analysis using the consensus matrix
      ' to define the distance function as described in the class description
      ' above.

      ' The cluster analysis class wants to cluster the rows of a matrix. Since we
      ' are essentially clustering a bunch of column numbers, we'll provide a matrix
      ' with one column and n rows where n is the number of columns of v (and the
      ' order of the consensus matrix). The column will contain the numbers 0
      ' to n - 1 (basically, we're just clustering the numbers 0,...,n - 1).
      Dim ItemNumbers As New DoubleMatrix(ConsensusMatrix.Order, 1, 0, 1)

      ' The distance function object holds the consensus matrix C and returns the distance
      ' between i and j as 1.0 - C[i,j]
      Dim DistanceFunctionObject As New ConsensusMatrixDistance(ConsensusMatrix)
      Dim ClusterAnalysisDist As New Distance.Function(AddressOf DistanceFunctionObject.CaDistance)
      Dim CA As New ClusterAnalysis(ItemNumbers, ClusterAnalysisDist)

      ' Form three clusters using the cluster analysis cut tree function and print them out.
      Dim ClusterS As ClusterSet = CA.CutTree(3)
      Console.WriteLine()
      For ClusterNumber As Integer = 0 To (ClusterS.NumberOfClusters - 1)
        ' Members holds the item numbers (column indices of v) in this cluster.
        Dim Members() As Integer = ClusterS.Cluster(ClusterNumber)
        Console.Write("Cluster number {0} contains: ", ClusterNumber)
        For I As Integer = 0 To (Members.Length - 1)
          Console.Write("{0} ", ConsensusMatrix.Labels(Members(I)))
        Next
        Console.WriteLine()
      Next

      Console.WriteLine()
      Console.WriteLine("Press Enter Key")
      Console.Read()

    End Sub

    ' Loads a tab-delimited data file into a DataFrame.
    '
    ' FileName: path of the data file to load.
    ' Returns:  the loaded DataFrame, or Nothing if DataFrame.Load raised an
    '           NMathException (a diagnostic is written to the console in
    '           that case). Any other exception type propagates to the caller.
    Function ReadDataFromFile(ByVal FileName As String) As DataFrame
      Try
        ' Load the example data into a DataFrame.
        ' NOTE(review): the three Boolean flags presumably select header row,
        ' row keys and value parsing - confirm against the NMath
        ' DataFrame.Load documentation.
        Return DataFrame.Load(FileName, True, True, ControlChars.Tab, True)
      Catch E As NMathException
        ' Report the path that was actually attempted together with the
        ' library's own message, rather than a hard-coded location.
        Dim Msg As New StringBuilder(String.Format("Could not find data file {0}.", FileName))
        Msg.AppendLine()
        Msg.AppendLine(E.Message)
        Msg.AppendLine("Relative paths are resolved against the current working ")
        Msg.Append("directory, normally the directory the executable is running in.")
        Console.WriteLine(Msg.ToString())
        Console.WriteLine()
        Return Nothing
      End Try

    End Function

  End Module

  ' Adapts a consensus (connectivity) matrix C into a distance function
  ' suitable for cluster analysis: the distance between items i and j is
  ' 1.0 - C(i, j).
  Class ConsensusMatrixDistance

    ' The matrix whose entries define the pairwise distances. Set once in
    ' the constructor.
    Private ReadOnly ConsensusMatrix As ConnectivityMatrix

    ' Creates a distance function object backed by the given matrix.
    '
    ' Conn: the consensus/connectivity matrix to read entries from.
    ' Throws ArgumentNullException if Conn is Nothing, so the failure shows
    ' up here rather than later inside the distance callback.
    Public Sub New(ByVal Conn As ConnectivityMatrix)
      If Conn Is Nothing Then
        Throw New ArgumentNullException("Conn")
      End If
      ConsensusMatrix = Conn
    End Sub

    ' Returns the distance between two items as 1.0 minus the matrix entry
    ' indexed by the first element of each vector.
    '
    ' Data1, Data2: vectors whose first element is used as an index into the
    '               matrix.
    Public Function CaDistance(ByVal Data1 As DoubleVector, ByVal Data2 As DoubleVector) As Double

      ' Explicit CInt conversions: same result as the implicit conversion,
      ' but also compiles under Option Strict On.
      Dim I As Integer = CInt(Data1(0))
      Dim J As Integer = CInt(Data2(0))
      Return 1.0 - ConsensusMatrix(I, J)

    End Function

  End Class

End Namespace

[TOC]