diff --git a/sdpa-c/sdpa-c-bug/log b/sdpa-c/sdpa-c-bug/log
new file mode 100644
index 0000000000000000000000000000000000000000..fecaaa4d455d29529139b3d208f87aaf10755f34
--- /dev/null
+++ b/sdpa-c/sdpa-c-bug/log
@@ -0,0 +1,48 @@
+Before lock
+After lock
+CN: 99232 3
+After unlock
+BREAK 99232 3
+End T
+Before lock
+After lock
+CN: 99233 3
+After unlock
+BREAK 99233 3
+End T
+Before lock
+After lock
+CN: 99234 3
+After unlock
+BREAK 99234 3
+End T
+Before lock
+After lock
+CN: 99235 3
+After unlock
+BREAK 99235 3
+End T
+Before lock
+After lock
+CN: 99236 3
+After unlock
+BREAK 99236 3
+End T
+Before lock
+After lock
+CN: 99237 3
+After unlock
+BREAK 99237 3
+End T
+Before lock
+After lock
+CN: 99238 3
+After unlock
+BREAK 99238 3
+End T
+Before lock
+After lock
+CN: 99239 3
+After unlock
+BREAK 99239 3
+End T
diff --git a/sdpa-c/sdpa-c-bug/mail b/sdpa-c/sdpa-c-bug/mail
new file mode 100644
index 0000000000000000000000000000000000000000..f1f224c1deb0a85259b6e17aebb02d356a123f3b
--- /dev/null
+++ b/sdpa-c/sdpa-c-bug/mail
@@ -0,0 +1,30 @@
+Hello!
+
+I compiled sdpa-c with gcc version 8.3.0 (Debian 8.3.0-6)
+from source code taken from
+https://sourceforge.net/projects/sdpa/files/sdpa-c/sdpa-c.7.3.8.src.20180125.tar.gz
+and then ran libexample/example1.exe it fail on Segmentation fail in
+`Newton::compute_bMatgVec_dense_threads_SDP` function.
+
+
+After some debugging I noticed that variable `Column_Number` overflows and
+becomes negative, which easily crashes the program. But why it overflows? This
+should not happen because its thread should immediately exit once it is bigger
+than `SDP_nConstraintl`, which is actually very small. I start debugging
+program using debugging prints [sdpa_newton_debug.cpp] and it look that
+something here is really wrong. After last command in the thread function
+program return to `for (int k1)` loop, increment Column_Number, break and
+repeat this part one again (part of output in [log]). But why? There is no
+`goto` statement, thread should immediate exit. Actually gcc add this goto by
+itself. The function should return `void*`, so exiting the function without any
+return id undefined behavior. So gcc could do anything during the optimization.
+Explicitly in this case it start repeat the loop. Notice, that if optimization
+of turned off, everything work well.
+
+How to solve that?
+In is easy, just add `return NULL` on the end of this function.
+In this file is more such functions, so please do it for each of them
+as is done in [sdpa_newton_fix.cpp].
+
+Thanks
+                Jiri Kalvoda
diff --git a/sdpa-c/sdpa-c-bug/sdpa_newton_debug.cpp b/sdpa-c/sdpa-c-bug/sdpa_newton_debug.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..818b85ccf548d7a81c942beee40771798ee78d33
--- /dev/null
+++ b/sdpa-c/sdpa-c-bug/sdpa_newton_debug.cpp
@@ -0,0 +1,1898 @@
+/* -------------------------------------------------------------
+
+This file is a component of SDPA
+Copyright (C) 2004-2013 SDPA Project
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+
+------------------------------------------------------------- */
+
+#include "sdpa_newton.h"
+#include "sdpa_parts.h"
+#include "sdpa_linear.h"  
+#include "sdpa_algebra.h"
+
+namespace sdpa {
+
+pthread_mutex_t Newton::job_mutex = PTHREAD_MUTEX_INITIALIZER;
+int Newton::Column_Number   = 0;
+int Newton::Column_NumberDx = 0;
+
+
+Newton::Newton()
+{
+  // Caution: if SDPA doesn't use sparse bMat, 
+  //          following variables are indefinite.
+  this->SDP_nBlock = -1;
+  SDP_number = NULL;  SDP_location_sparse_bMat = NULL;
+  SDP_constraint1 = NULL;  SDP_constraint2 = NULL;
+  SDP_blockIndex1 = NULL;  SDP_blockIndex2 = NULL;
+  this->LP_nBlock = -1;
+  LP_number = NULL;  LP_location_sparse_bMat = NULL;
+  LP_constraint1 = NULL;  LP_constraint2 = NULL;
+  LP_blockIndex1 = NULL;  LP_blockIndex2 = NULL;
+
+  diagonalIndex = NULL;
+  NUM_THREADS  = 1;
+}
+
+Newton::Newton(int m, BlockStruct& bs)
+{
+  initialize(m, bs);
+}
+
+Newton::~Newton()
+{
+  finalize();
+}
+
+void Newton::initialize(int m, BlockStruct& bs)
+{
+  gVec.initialize(m);
+
+  SDP_nBlock  = bs.SDP_nBlock;
+  LP_nBlock   = bs.LP_nBlock;
+  
+  bMat_type = DENSE;
+  // Caution: if SDPA doesn't use sparse bMat, 
+  //          following variables are indefinite.
+  this->SDP_nBlock = -1;
+  SDP_number = NULL;  SDP_location_sparse_bMat = NULL;
+  SDP_constraint1 = NULL;  SDP_constraint2 = NULL;
+  SDP_blockIndex1 = NULL;  SDP_blockIndex2 = NULL;
+  this->LP_nBlock = -1;
+  LP_number = NULL;  LP_location_sparse_bMat = NULL;
+  LP_constraint1 = NULL;  LP_constraint2 = NULL;
+  LP_blockIndex1 = NULL;  LP_blockIndex2 = NULL;
+
+  diagonalIndex = NULL;
+}
+
+void Newton::finalize()
+{
+
+  if (bMat_type == SPARSE){
+
+    if (SDP_location_sparse_bMat && SDP_constraint1 && SDP_constraint2
+	&& SDP_blockIndex1 && SDP_blockIndex2) {
+      for (int l=0; l<SDP_nBlock; ++l) {
+	DeleteArray(SDP_location_sparse_bMat[l]);
+	DeleteArray(SDP_constraint1[l]);
+	DeleteArray(SDP_constraint2[l]);
+	DeleteArray(SDP_blockIndex1[l]);
+	DeleteArray(SDP_blockIndex2[l]);
+	DeleteArray(SDP_startIndex2[l]);
+      }
+      DeleteArray(SDP_number);
+      DeleteArray(SDP_location_sparse_bMat);
+      DeleteArray(SDP_constraint1);
+      DeleteArray(SDP_constraint2);
+      DeleteArray(SDP_blockIndex1);
+      DeleteArray(SDP_blockIndex2);
+      DeleteArray(SDP_startIndex2);
+      DeleteArray(SDP_nStartIndex2);
+    }
+    if (LP_location_sparse_bMat && LP_constraint1 && LP_constraint2
+	&& LP_blockIndex1 && LP_blockIndex2) {
+      for (int l=0; l<LP_nBlock; ++l) {
+	DeleteArray(LP_location_sparse_bMat[l]);
+	DeleteArray(LP_constraint1[l]);
+	DeleteArray(LP_constraint2[l]);
+	DeleteArray(LP_blockIndex1[l]);
+	DeleteArray(LP_blockIndex2[l]);
+	DeleteArray(LP_startIndex2[l]);
+      }
+      DeleteArray(LP_number);
+      DeleteArray(LP_location_sparse_bMat);
+      DeleteArray(LP_constraint1);
+      DeleteArray(LP_constraint2);
+      DeleteArray(LP_blockIndex1);
+      DeleteArray(LP_blockIndex2);
+      DeleteArray(LP_startIndex2);
+      DeleteArray(LP_nStartIndex2);
+    }
+
+    DeleteArray(diagonalIndex);
+    sparse_bMat.finalize();
+
+  } else { // bMat_type == DENSE
+    bMat.finalize();
+  }
+
+  int m = gVec.nDim;
+  gVec.finalize();
+}
+
+void Newton::initialize_dense_bMat(int m)
+{
+  //  bMat_type = DENSE;
+  //  printf("DENSE computations\n");
+  bMat.initialize(m,m);
+}
+
+  // 2008/03/12 kazuhide nakata
+void Newton::initialize_sparse_bMat(int m)
+{
+
+  //  bMat_type = SPARSE;
+  //  printf("SPARSE computation\n");
+
+  // initialize sparse_bMat by Chordal::makeGraph
+  //  sparse_bMat.display();
+
+  bool isEmptyMatrix = false;
+  // make index of diagonalIndex
+  NewArray(diagonalIndex,int,m+1);
+  int k=0;
+  for (int index=0; index<sparse_bMat.NonZeroCount; index++){
+    if (sparse_bMat.row_index[index] == sparse_bMat.column_index[index]) {
+      diagonalIndex[k] = index;
+      if (sparse_bMat.row_index[index] != k+1) {
+	rMessage("The matrix [" << (sparse_bMat.row_index[index]-1)
+		 << "] is empty");
+	isEmptyMatrix = true;
+	diagonalIndex[k+1] = diagonalIndex[k];
+	k++;
+      }
+      k++;
+    }
+  }
+  if (isEmptyMatrix) {
+    rMessage("Input Data Error :: Some Input Matricies are Empty");
+  }
+    
+  diagonalIndex[m] = sparse_bMat.NonZeroCount;
+  #if 0
+  rMessage("diagonalIndex = ");
+  for (int index=0; index <m; ++index) {
+    printf(" [%d:%d]",index,diagonalIndex[index]);
+  }
+  printf("\n");
+  #endif
+}
+
+  // 2008/03/12 kazuhide nakata
+void Newton::initialize_bMat(int m, Chordal& chordal,
+			     InputData& inputData,
+			     FILE* Display,
+                             FILE* fpOut)
+{
+  /* Create clique tree */
+
+  switch (chordal.best) {
+  case SELECT_DENSE: 
+    bMat_type = DENSE;
+    if (Display) {
+      fprintf(Display,"Schur computation : DENSE \n");
+    }
+    if (fpOut) {
+      fprintf(fpOut,"Schur computation : DENSE \n");
+    }
+    initialize_dense_bMat(m);
+    // Here, we release MUMPS and sparse_bMat
+    chordal.finalize();
+    break;
+  case SELECT_MUMPS_BEST: 
+    bMat_type = SPARSE;
+    if (Display) {
+      fprintf(Display,"Schur computation : SPARSE \n");
+    }
+    if (fpOut) {
+      fprintf(fpOut,"Schur computation : SPARSE \n");
+    }
+    initialize_sparse_bMat(m);
+    make_aggrigateIndex(inputData);
+    break;
+  default: 
+    rError("Wrong Ordering Obtained");
+    break;
+  }
+
+}
+
+int Newton::binarySearchIndex(int i, int j)
+{
+  // binary search for index of sparse_bMat 
+  int t = -1;
+  // We store only the lower triangular
+  int ii = i, jj = j;
+  if (i<j) {
+    jj = i;
+    ii = j;
+  }
+  int begin  = diagonalIndex[jj]; 
+  int end    = diagonalIndex[jj+1]-1;
+  int target = (begin + end) / 2;
+  while (end - begin > 1){
+    if (sparse_bMat.row_index[target] < ii+1){
+      begin = target;
+      target = (begin + end) / 2;
+    } else if (sparse_bMat.row_index[target] > ii+1){
+      end = target;
+      target = (begin + end) / 2;
+    } else if (sparse_bMat.row_index[target] == ii+1) {
+      t = target;
+      break;
+    }
+  }
+  if (t == -1){
+    if (sparse_bMat.row_index[begin] == ii+1){
+      t = begin;
+    } else if (sparse_bMat.row_index[end] == ii+1){
+      t = end;
+    } else {
+      #if 0
+      int m = sparse_bMat.nRow;
+      rMessage("Trouble ii = " << ii << " jj = " << j <<  " m = " << m);
+      for (int k = 0; k<sparse_bMat.NonZeroCount; ++k) {
+	fprintf(stdout,"[%04d,%04d] at %04d\n",
+		sparse_bMat.row_index[k],
+		sparse_bMat.column_index[k],
+		k);
+      }
+      #endif
+      // rError("Newton::make_aggrigateIndex program bug");
+    }
+  } 
+  return t;
+}
+  
+
+void Newton::make_aggrigateIndex_SDP(InputData& inputData)
+{
+
+  SDP_nBlock = inputData.SDP_nBlock;
+  NewArray(SDP_number,int,SDP_nBlock);
+
+  // memory allocate for aggrigateIndex
+  NewArray(SDP_constraint1,int*,SDP_nBlock);
+  NewArray(SDP_constraint2,int*,SDP_nBlock);
+  NewArray(SDP_blockIndex1,int*,SDP_nBlock);
+  NewArray(SDP_blockIndex2,int*,SDP_nBlock);
+  NewArray(SDP_location_sparse_bMat,int*,SDP_nBlock);
+  NewArray(SDP_nStartIndex2, int, SDP_nBlock);
+  NewArray(SDP_startIndex2, int*, SDP_nBlock);
+
+  for (int l=0; l<SDP_nBlock; l++){
+    const int size = (inputData.SDP_nConstraint[l] + 1) 
+      * inputData.SDP_nConstraint[l] / 2;
+    SDP_number[l] = size;
+    NewArray(SDP_constraint1[l],int,size);
+    NewArray(SDP_constraint2[l],int,size);
+    NewArray(SDP_blockIndex1[l],int,size);
+    NewArray(SDP_blockIndex2[l],int,size);
+    NewArray(SDP_location_sparse_bMat[l],int,size);
+  }
+
+  for (int l = 0; l<SDP_nBlock; l++){
+    int NonZeroCount = 0;
+    vector<int> startTmp;
+
+    for (int k1=0; k1<inputData.SDP_nConstraint[l]; k1++){
+      int j = inputData.SDP_constraint[l][k1];
+      int jb = inputData.SDP_blockIndex[l][k1];
+      startTmp.push_back(NonZeroCount);
+
+      for (int k2=0; k2<inputData.SDP_nConstraint[l]; k2++){
+	int i = inputData.SDP_constraint[l][k2];
+	int ib = inputData.SDP_blockIndex[l][k2];
+
+	if (i < j) {
+	  continue;
+	}
+	// set index which A_i and A_j are not zero matrix
+	int target = binarySearchIndex(i,j);
+	if (target == -1) {
+	  rMessage("("<<(i+1)<<","<<(j+1)<<") might have index");
+	  SDP_number[l]--;
+	  continue;
+	}
+	SDP_constraint1[l][NonZeroCount] = i;
+	SDP_constraint2[l][NonZeroCount] = j;
+	SDP_blockIndex1[l][NonZeroCount] = ib;
+	SDP_blockIndex2[l][NonZeroCount] = jb;
+	SDP_location_sparse_bMat[l][NonZeroCount] = target;
+	NonZeroCount++;
+      }
+    } // for k1
+    // The last element to stop the array
+    startTmp.push_back(NonZeroCount);
+    const int startTmp_size = startTmp.size();
+    SDP_nStartIndex2[l] = startTmp_size-1; // the last is the stopper
+    NewArray(SDP_startIndex2[l], int, startTmp_size);
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      SDP_startIndex2[l][index1] = startTmp[index1];
+    }
+    #if 0
+    rMessage("SDP_startIndex["<<l
+	     <<"][" << startTmp_size << "] = ");
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      printf("%d->%d ", index1,startTmp[index1]);
+    }
+    #endif
+  } //for l  lth block
+}
+
+void Newton::make_aggrigateIndex_LP(InputData& inputData)
+{
+  LP_nBlock = inputData.LP_nBlock;
+  NewArray(LP_number,int,LP_nBlock);
+
+  // memory allocate for aggrigateIndex
+  NewArray(LP_constraint1,int*,LP_nBlock);
+  NewArray(LP_constraint2,int*,LP_nBlock);
+  NewArray(LP_blockIndex1,int*,LP_nBlock);
+  NewArray(LP_blockIndex2,int*,LP_nBlock);
+  NewArray(LP_location_sparse_bMat,int*,LP_nBlock);
+  NewArray(LP_nStartIndex2, int, LP_nBlock);
+  NewArray(LP_startIndex2, int*, LP_nBlock);
+
+  for (int l=0; l<LP_nBlock; l++){
+    int size = (inputData.LP_nConstraint[l]+1)
+      * inputData.LP_nConstraint[l]/2;
+    LP_number[l] = size;
+    NewArray(LP_constraint1[l],int,size);
+    NewArray(LP_constraint2[l],int,size);
+    NewArray(LP_blockIndex1[l],int,size);
+    NewArray(LP_blockIndex2[l],int,size);
+    NewArray(LP_location_sparse_bMat[l],int,size);
+  }
+
+  for (int l = 0; l<LP_nBlock; l++){
+    int NonZeroCount = 0;
+    vector<int> startTmp;
+
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++){
+      int j = inputData.LP_constraint[l][k1];
+      int jb = inputData.LP_blockIndex[l][k1];
+      startTmp.push_back(NonZeroCount);
+
+      for (int k2=0; k2<inputData.LP_nConstraint[l]; k2++){
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+
+	if (i < j){
+	  continue;
+	}
+
+	// set index which A_i and A_j are not zero matrix
+	LP_constraint1[l][NonZeroCount] = i;
+	LP_constraint2[l][NonZeroCount] = j;
+	LP_blockIndex1[l][NonZeroCount] = ib;
+	LP_blockIndex2[l][NonZeroCount] = jb;
+	
+	int target = binarySearchIndex(i,j);
+	LP_location_sparse_bMat[l][NonZeroCount] = target;
+	NonZeroCount++;
+      }
+    } // for k1
+    // The last element to stop the array
+    startTmp.push_back(NonZeroCount);
+    const int startTmp_size = startTmp.size();
+    LP_nStartIndex2[l] = startTmp_size-1; // the last is the stopper
+    NewArray(LP_startIndex2[l], int, startTmp_size);
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      LP_startIndex2[l][index1] = startTmp[index1];
+    }
+  } //for l  lth block
+}
+
+void Newton::make_aggrigateIndex(InputData& inputData)
+{
+  make_aggrigateIndex_SDP(inputData);
+  make_aggrigateIndex_LP(inputData);
+}
+
+
+void Newton::setNumThreads(FILE* Display, FILE* fpOut, int NumThreads)
+{
+  if (NumThreads == 0) { // Automatic from OMP_NUM_THREADS
+    char* env1      = NULL;
+    env1 = getenv("OMP_NUM_THREADS");
+    if (env1 != NULL) {
+      NUM_THREADS = atoi(env1);
+      if (NUM_THREADS < 0) {
+	rError("OMP_NUM_THREADS is negative!!");
+      }
+    }
+    else {
+      NUM_THREADS = 1;
+    }
+  }
+  else {
+    NUM_THREADS = NumThreads;
+  }
+  if (Display) {
+    fprintf(Display,"NumThreads  is set as %d\n", NUM_THREADS);
+  }
+  if (fpOut) {
+    fprintf(fpOut,  "NumThreads  is set as %d\n", NUM_THREADS);
+  }
+  // This affects for only OpenBLAS
+  blas_set_num_threads(NUM_THREADS);
+}
+
+void Newton::compute_bMatgVec_dense(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  // rMessage("mu.current = " << mu.current);
+  // rMessage("target_mu = " << target_mu);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  // rMessage("cholmodSpace = "); cholmodSpace.display();
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++) {
+      const int j = inputData.LP_constraint[l][k1];
+      const int jb = inputData.LP_blockIndex[l][k1];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      const double xMatInvzMatAj = xMat * invzMat * Aj;
+      for (int k2=k1; k2<inputData.LP_nConstraint[l]; k2++) {
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+	double Ai = inputData.A[i].LP_sp_block[ib];
+	double value =  xMatInvzMatAj * Ai;
+	// only lower triangular will be computed
+	// rMessage("add to (" << i << " , " << j << ")");
+	bMat.de_ele[i+m*j] += value;
+      } // end of 'for (int i)'
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	gadd = xMat * rD;
+      }
+      gVec.ele[j] += Aj * invzMat * (gadd - target_mu);
+    } // end of 'for (int j)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int k1=0; k1<inputData.SDP_nConstraint[l]; k1++) {
+      int j = inputData.SDP_constraint[l][k1];
+      int jb = inputData.SDP_blockIndex[l][k1];
+      // rMessage("j = " << j << " : jb = " << jb);
+      CompMatrix& Aj = inputData.A[j].SDP_sp_block[jb];
+
+      const int nCol = Aj.nzColumn;
+      double gj1 = 0.0;
+      double gj2 = 0.0;
+      for (int k_index = 0; k_index < nCol; ++k_index) {
+	const int k = Aj.column_index[k_index];
+	// rMessage("k = " << k << ", k_index = " << k_index);
+	// rMessage("j = " << j << " : jb = " << jb);
+	cholmodMatrix.setB_Zzero();
+	double* b_z = (double*)(cholmodMatrix.b_z->x);
+	const int i_start = Aj.column_start[k_index];
+	const int i_end   = Aj.column_start[k_index+1];
+	for (int i_index = i_start; i_index < i_end; ++i_index) {
+	  const int i2 = Aj.row_index[i_index];
+	  b_z[i2] = Aj.ele[i_index];
+	}
+	cholmodMatrix.solveByZ();
+	cholmodMatrix.setB_Xzero();
+	double* b_x = (double*)(cholmodMatrix.b_x->x);
+	b_x[k] = 1.0;
+	cholmodMatrix.solveByX();
+	double* x_z = (double*)(cholmodMatrix.x_z->x);
+	double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+	#if 0
+	rMessage("Aj = "); Aj.display();
+	rMessage(" b_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_x);
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	rMessage(" b_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_z);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	// rMessage(" rD = ");
+	// CholmodMatrix::display_sparse(cholmodMatrix.rD);
+	#endif
+	
+	double gadd = 0.0;
+	if (phase.value == SolveInfo:: pFEAS
+	    || phase.value == SolveInfo::noINFO) {
+	  Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	}
+	// rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+	gj1 += gadd;
+	gj2 += x_z[k];
+	for (int k2=k1; k2<inputData.SDP_nConstraint[l]; k2++) {
+	  int i = inputData.SDP_constraint[l][k2];
+	  int ib = inputData.SDP_blockIndex[l][k2];
+	  CompMatrix& Ai = inputData.A[i].SDP_sp_block[ib];
+	  double Badd = 0.0;
+	  Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	  bMat.de_ele[i+j*m] += Badd;
+	  #if 0
+	  rMessage("i=" << i << ", j=" << j
+		   << ", k=" << k << ", Badd=" << Badd );
+	  rMessage(" b_x = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.b_x);
+	  rMessage(" x_x = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	  rMessage(" b_z = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.b_z);
+	  rMessage(" x_z = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	  rMessage(" Ai  = "); Ai.display();
+	  #endif
+	}
+      } // end of 'for (int k_index = 0; k_index < nCol; ++k_index)'
+      // rMessage("target_mu = " << target_mu);
+      gVec.ele[j] += (gj1 - target_mu * gj2);
+    } // end of 'for (int k1)'
+  } // end of 'for (int l)'
+
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+
+void Newton::compute_bMatgVec_dense_threads(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  // rMessage("mu.current = " << mu.current);
+  // rMessage("target_mu = " << target_mu);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  // rMessage("cholmodSpace = "); cholmodSpace.display();
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++) {
+      const int j = inputData.LP_constraint[l][k1];
+      const int jb = inputData.LP_blockIndex[l][k1];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      const double xMatInvzMatAj = xMat * invzMat * Aj;
+      for (int k2=k1; k2<inputData.LP_nConstraint[l]; k2++) {
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+	double Ai = inputData.A[i].LP_sp_block[ib];
+	double value =  xMatInvzMatAj * Ai;
+	// only lower triangular will be computed
+	// rMessage("add to (" << i << " , " << j << ")");
+	bMat.de_ele[i+m*j] += value;
+      } // end of 'for (int i)'
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	gadd = xMat * rD;
+      }
+      gVec.ele[j] += Aj * invzMat * (gadd - target_mu);
+    } // end of 'for (int j)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_arg_t* targ;
+  NewArray(targ, thread_arg_t, NUM_THREADS);
+  
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    targ[thread_num].m              = m;
+    targ[thread_num].target_mu      = target_mu;
+    targ[thread_num].thread_num     = thread_num;
+    targ[thread_num].addr_inputData = &inputData;
+    targ[thread_num].addr_bMat      = &bMat;
+    targ[thread_num].addr_gVec      = &gVec;
+    targ[thread_num].addr_phase     = &phase;
+  }
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    Column_Number = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      targ[thread_num].l = l;
+      targ[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_bMatgVec_dense_threads_SDP,
+		     (void *)&targ[thread_num]);
+    }
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }
+
+  DeleteArray(handle);
+  DeleteArray(targ);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+    
+void* Newton::compute_bMatgVec_dense_threads_SDP(void* arg) 
+{
+	printf("New T\n");
+  thread_arg_t* targ = (thread_arg_t*) arg;
+  const int l =  targ->l;
+  const int m =  targ->m;
+  const double target_mu = targ->target_mu;
+  const int thread_num = targ->thread_num;
+  InputData&     inputData     = *(targ->addr_inputData);
+  CholmodMatrix& cholmodMatrix = *(targ->addr_cholmodMatrix);
+  DenseMatrix&   bMat          = *(targ->addr_bMat);
+  Vector&        gVec          = *(targ->addr_gVec);
+  Phase&         phase         = *(targ->addr_phase);
+  CompSpace* A = inputData.A;
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_common& common = cholmodMatrix.common;
+
+  const int SDP_nConstraintl = inputData.SDP_nConstraint[l];
+  int* SDP_constraintl = inputData.SDP_constraint[l];
+  int* SDP_blockIndexl = inputData.SDP_blockIndex[l];
+
+
+  // for multi-threads computing work space
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+  int k1 = 0; // dummy initialize
+	printf("Before while\n");
+  while (1) {
+	printf("Before lock\n");
+    pthread_mutex_lock(&job_mutex);
+	printf("After lock\n");
+    k1 = Column_Number;
+    Column_Number++;
+	printf("CN: %d %d\n", Column_Number, SDP_nConstraintl);
+    pthread_mutex_unlock(&job_mutex);
+	printf("After unlock\n");
+    if (k1>=SDP_nConstraintl) {
+	printf("BREAK %d %d\n", Column_Number, SDP_nConstraintl);
+      break;
+    }
+
+    int j = SDP_constraintl[k1];
+    int jb = SDP_blockIndexl[k1];
+    // rMessage("j = " << j << " : jb = " << jb);
+    CompMatrix& Aj = A[j].SDP_sp_block[jb];
+
+    const int nCol = Aj.nzColumn;
+    double gj1 = 0.0;
+    double gj2 = 0.0;
+    for (int k_index = 0; k_index < nCol; ++k_index) {
+      const int k = Aj.column_index[k_index];
+      // rMessage("k = " << k << ", k_index = " << k_index);
+      // rMessage("j = " << j << " : jb = " << jb);
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_z[index1] = 0.0;
+      }
+      const int i_start = Aj.column_start[k_index];
+      const int i_end   = Aj.column_start[k_index+1];
+      for (int i_index = i_start; i_index < i_end; ++i_index) {
+	const int i2 = Aj.row_index[i_index];
+	b_z[i2] = Aj.ele[i_index];
+      }
+
+      cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+      cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+      cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+      double* x_z = (double*)(x_z1->x);
+      
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_x[index1] = 0.0;
+      }
+      b_x[k] = 1.0;
+      
+      cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+      cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+      cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+      cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+      double* x_x = (double*)(x_x1->x);
+#if 0
+      rMessage("Aj = "); Aj.display();
+      rMessage(" b_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.b_x1);
+      rMessage(" x_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_x1);
+      rMessage(" b_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.b_z1);
+      rMessage(" x_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_z1);
+      // rMessage(" rD = ");
+      // CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+      double gadd = 0.0;
+      if (phase.value == SolveInfo::pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+      }
+      // rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+      gj1 += gadd;
+      gj2 += x_z[k];
+      for (int k2=k1; k2<inputData.SDP_nConstraint[l]; k2++) {
+	int i = SDP_constraintl[k2];
+	int ib = SDP_blockIndexl[k2];
+	CompMatrix& Ai = A[i].SDP_sp_block[ib];
+	double Badd = 0.0;
+	Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	bMat.de_ele[i+j*m] += Badd;
+#if 0
+	rMessage("i=" << i << ", j=" << j
+		 << ", k=" << k << ", Badd=" << Badd );
+	rMessage(" b_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_x1);
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x1);
+	rMessage(" b_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_z1);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z1);
+	rMessage(" Ai  = "); Ai.display();
+#endif
+      }
+      cholmod_free_dense(&b2,&common);
+      cholmod_free_dense(&x_z2,&common);
+      cholmod_free_dense(&b_x2,&common);
+      cholmod_free_dense(&x_x3,&common);
+      cholmod_free_dense(&x_x2,&common);      
+      cholmod_free_dense(&x_x1,&common);
+      cholmod_free_dense(&x_z1,&common);
+    } // end of 'for (int k_index = 0; k_index < nCol; ++k_index)'
+    // rMessage("target_mu = " << target_mu);
+    gVec.ele[j] += (gj1 - target_mu * gj2);
+  } // end of 'for (int k1)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+	printf("End T\n");
+	return NULL;
+}
+
+
+
+void Newton::compute_bMatgVec_sparse(InputData& inputData,
+				     Solutions& currentPt,
+				     Residuals& currentRes,
+				     AverageComplementarity& mu,
+				     DirectionParameter& beta,
+				     Phase& phase,
+				     ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int index2 = 0; index2 < LP_nStartIndex2[l]; ++index2) {
+      const int start_iter = LP_startIndex2[l][index2];
+      const int end_iter   = LP_startIndex2[l][index2+1];
+      const int j  = LP_constraint2[l][start_iter];
+      const int jb = LP_blockIndex2[l][start_iter];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      gVec.ele[j] += Aj * invzMat * ( xMat * rD  - target_mu);
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	const int i = LP_constraint1[l][iter];
+	const int ib = LP_blockIndex1[l][iter];
+	const double Ai = inputData.A[i].LP_sp_block[ib];
+	const double value = xMat * invzMat * Ai * Aj;
+	sparse_bMat.sp_ele[LP_location_sparse_bMat[l][iter]] += value;
+      }
+    } // end of 'for (int start_iter)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int index2 = 0; index2 < SDP_nStartIndex2[l]; ++index2) {
+      const int start_iter = SDP_startIndex2[l][index2];
+      const int end_iter   = SDP_startIndex2[l][index2+1];
+      int j  = SDP_constraint2[l][start_iter];
+      int jb = SDP_blockIndex2[l][start_iter];
+      CompMatrix& Aj = inputData.A[j].SDP_sp_block[jb];
+
+      const int nCol = Aj.nzColumn;
+      double gj1 = 0.0;
+      double gj2 = 0.0;
+      for (int k_index = 0; k_index < nCol; ++k_index) {
+	const int k = Aj.column_index[k_index];
+	cholmodMatrix.setB_Zzero();
+	double* b_z = (double*)(cholmodMatrix.b_z->x);
+	const int i_start = Aj.column_start[k_index];
+	const int i_end   = Aj.column_start[k_index+1];
+	for (int i_index = i_start; i_index < i_end; ++i_index) {
+	  const int i2 = Aj.row_index[i_index];
+	  b_z[i2] = Aj.ele[i_index];
+	}
+	cholmodMatrix.solveByZ();
+	cholmodMatrix.setB_Xzero();
+	double* b_x = (double*)(cholmodMatrix.b_x->x);
+	b_x[k] = 1.0;
+	cholmodMatrix.solveByX();
+	double* x_z = (double*)(cholmodMatrix.x_z->x);
+	double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+#if 0
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	rMessage(" rD = ");
+	CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+	double gadd = 0.0;
+	if (phase.value == SolveInfo:: pFEAS
+	    || phase.value == SolveInfo::noINFO) {
+	  Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	  // rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+	}
+	gj1 += gadd;
+	gj2 += x_z[k];
+
+	for (int iter = start_iter; iter < end_iter; ++iter) {
+	  int i = SDP_constraint1[l][iter];
+	  int ib = SDP_blockIndex1[l][iter];
+	  CompMatrix& Ai = inputData.A[i].SDP_sp_block[ib];
+	  double Badd = 0.0;
+	  Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	  sparse_bMat.sp_ele[SDP_location_sparse_bMat[l][iter]] += Badd;
+	}
+      } // end of 'for (int k_index)'
+      gVec.ele[j] += (gj1 - target_mu * gj2);
+    } // end of 'for (int j)'
+  }
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+
+void Newton::compute_bMatgVec_sparse_threads(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int index2 = 0; index2 < LP_nStartIndex2[l]; ++index2) {
+      const int start_iter = LP_startIndex2[l][index2];
+      const int end_iter   = LP_startIndex2[l][index2+1];
+      const int j  = LP_constraint2[l][start_iter];
+      const int jb = LP_blockIndex2[l][start_iter];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      gVec.ele[j] += Aj * invzMat * ( xMat * rD  - target_mu);
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	const int i = LP_constraint1[l][iter];
+	const int ib = LP_blockIndex1[l][iter];
+	const double Ai = inputData.A[i].LP_sp_block[ib];
+	const double value = xMat * invzMat * Ai * Aj;
+	sparse_bMat.sp_ele[LP_location_sparse_bMat[l][iter]] += value;
+      }
+    } // end of 'for (int start_iter)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_arg_s* targ;
+  NewArray(targ, thread_arg_s, NUM_THREADS);
+  
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    targ[thread_num].m                = m;
+    targ[thread_num].target_mu        = target_mu;
+    targ[thread_num].thread_num       = thread_num;
+    targ[thread_num].addr_inputData   = &inputData;
+    targ[thread_num].addr_sparse_bMat = &sparse_bMat;
+    targ[thread_num].addr_gVec        = &gVec;
+    targ[thread_num].addr_phase       = &phase;
+    targ[thread_num].addr_newton      = this;
+  }
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    Column_Number = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      targ[thread_num].l = l;
+      targ[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_bMatgVec_sparse_threads_SDP,
+		     (void *)&targ[thread_num]);
+    }
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }
+
+  DeleteArray(handle);
+  DeleteArray(targ);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+    
+void* Newton::compute_bMatgVec_sparse_threads_SDP(void* arg) 
+{
+  thread_arg_s* targ = (thread_arg_s*) arg;
+  const int l =  targ->l;
+  const int m =  targ->m;
+  const double target_mu = targ->target_mu;
+  const int thread_num = targ->thread_num;
+  InputData&     inputData     = *(targ->addr_inputData);
+  CholmodMatrix& cholmodMatrix = *(targ->addr_cholmodMatrix);
+  SparseMatrix&  sparse_bMat   = *(targ->addr_sparse_bMat);
+  Vector&        gVec          = *(targ->addr_gVec);
+  Phase&         phase         = *(targ->addr_phase);
+  Newton&        newton        = *(targ->addr_newton);
+  CompSpace* A = inputData.A;
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_common& common = cholmodMatrix.common;
+
+  int* SDP_constraint1l = newton.SDP_constraint1[l];
+  int* SDP_blockIndex1l = newton.SDP_blockIndex1[l];
+  int* SDP_constraint2l = newton.SDP_constraint2[l];
+  int* SDP_blockIndex2l = newton.SDP_blockIndex2[l];
+  int* SDP_startIndex2l = newton.SDP_startIndex2[l];
+  const int SDP_nStartIndex2l    = newton.SDP_nStartIndex2[l];
+  int* SDP_location_sparse_bMatl = newton.SDP_location_sparse_bMat[l];
+
+  // for multi-threads computing work space
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+
+  int index2 = 0; // dummy initialize
+  while (1) {
+    pthread_mutex_lock(&job_mutex);
+    index2 = Column_Number++;
+    pthread_mutex_unlock(&job_mutex);
+    if (index2 >= SDP_nStartIndex2l) {
+      break;
+    }
+    const int start_iter = SDP_startIndex2l[index2];
+    const int end_iter   = SDP_startIndex2l[index2+1];
+    int j  = SDP_constraint2l[start_iter];
+    int jb = SDP_blockIndex2l[start_iter];
+    CompMatrix& Aj = A[j].SDP_sp_block[jb];
+
+    const int nCol = Aj.nzColumn;
+    double gj1 = 0.0;
+    double gj2 = 0.0;
+    for (int k_index = 0; k_index < nCol; ++k_index) {
+      const int k = Aj.column_index[k_index];
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_z[index1] = 0.0;
+      }
+      const int i_start = Aj.column_start[k_index];
+      const int i_end   = Aj.column_start[k_index+1];
+      for (int i_index = i_start; i_index < i_end; ++i_index) {
+	const int i2 = Aj.row_index[i_index];
+	b_z[i2] = Aj.ele[i_index];
+      }
+      cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+      cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+      cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+      double* x_z = (double*)(x_z1->x);
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_x[index1] = 0.0;
+      }
+      b_x[k] = 1.0;
+      
+      cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+      cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+      cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+      cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+      double* x_x = (double*)(x_x1->x);
+
+#if 0
+      rMessage(" x_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_x);
+      rMessage(" x_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_z);
+      rMessage(" rD = ");
+      CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	// rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+      }
+      gj1 += gadd;
+      gj2 += x_z[k];
+
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	int i = SDP_constraint1l[iter];
+	int ib = SDP_blockIndex1l[iter];
+	CompMatrix& Ai = A[i].SDP_sp_block[ib];
+	double Badd = 0.0;
+	Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	sparse_bMat.sp_ele[SDP_location_sparse_bMatl[iter]] += Badd;
+      }
+      cholmod_free_dense(&b2,&common);
+      cholmod_free_dense(&x_z2,&common);
+      cholmod_free_dense(&b_x2,&common);
+      cholmod_free_dense(&x_x3,&common);
+      cholmod_free_dense(&x_x2,&common);      
+      cholmod_free_dense(&x_x1,&common);
+      cholmod_free_dense(&x_z1,&common);
+    } // end of 'for (int k_index)'
+    gVec.ele[j] += (gj1 - target_mu * gj2);
+  } // end of 'for (int index2, aka j)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+  return NULL;
+}
+
+
+
+void Newton::Make_bMatgVec(InputData& inputData,
+			   Solutions& currentPt,
+			   Residuals& currentRes,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   Phase& phase,
+			   ComputeTime& com)
+{
+  TimeStart(START3);
+  gVec.copyFrom(inputData.b);
+  if (bMat_type == SPARSE){
+    // set sparse_bMat zero
+    sdpa_dset(sparse_bMat.NonZeroCount, 0.0, sparse_bMat.sp_ele, 1);
+    #if 0
+    compute_bMatgVec_sparse(inputData, currentPt, currentRes, mu, beta,
+    			    phase, com);
+    #else
+    compute_bMatgVec_sparse_threads(inputData, currentPt, currentRes, mu, beta,
+    			    phase, com);
+    #endif
+  } else {
+    bMat.setZero();
+    #if 0
+    compute_bMatgVec_dense(inputData, currentPt, currentRes, mu, beta,
+			   phase, com);
+    #else
+    compute_bMatgVec_dense_threads(inputData, currentPt, currentRes, mu, beta,
+			   phase, com);
+    #endif
+  }
+  
+  #if 0
+  // display_index();
+  rMessage("bMat =  ");
+  if (bMat_type == DENSE) {
+    bMat.display();
+  }
+  else {
+    sparse_bMat.display();
+  }
+  rMessage("gVec = ");
+  gVec.display();
+  #endif
+  TimeEnd(END3);
+  com.makebMatgVec += TimeCal(START3,END3);
+}
+
+bool Newton::compute_DyVec(Newton::WHICH_DIRECTION direction,
+			   int m,
+			   InputData& inputData,
+			   Chordal& chordal,
+			   Solutions& currentPt,
+			   ComputeTime& com,
+			   FILE* Display, FILE* fpOut)
+{
+  if (direction == PREDICTOR) {
+    TimeStart(START3_2);
+    
+    if (bMat_type == SPARSE){
+      bool ret = chordal.factorizeSchur(m, diagonalIndex, Display, fpOut);
+      if (ret == SDPA_FAILURE) {
+	return SDPA_FAILURE;
+      }
+    } else {
+      bool ret = Lal::choleskyFactorWithAdjust(bMat);
+      if (ret == SDPA_FAILURE) {
+	return SDPA_FAILURE;
+      }
+    }
+    // rMessage("Cholesky of bMat =  ");
+    // bMat.display();
+    // sparse_bMat.display();
+    TimeEnd(END3_2);
+    com.choleskybMat += TimeCal(START3_2,END3_2);
+  }
+  // bMat is already cholesky factorized.
+
+
+  Vector& DyVec = currentPt.cholmodSpace.dyVec;
+  
+  TimeStart(START4);
+  if (bMat_type == SPARSE){
+    DyVec.copyFrom(gVec);
+    chordal.solveSchur(DyVec);
+  } else {
+    Lal::let(DyVec,'=',bMat,'/',gVec);
+  }
+  TimeEnd(END4);
+  com.solve += TimeCal(START4,END4);
+  // rMessage("DyVec =  ");
+  // DyVec.display();
+  return SDPA_SUCCESS;
+}
+
+void Newton::compute_DzMat(InputData& inputData,
+			   Solutions& currentPt,
+			   Residuals& currentRes,
+			   Phase& phase,
+			   ComputeTime& com)
+{
+  TimeStart(START_SUMDZ);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  CompSpace* A = inputData.A;
+
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dZ[l] = 0.0;
+  }
+  
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+    const int length = dZ->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      ((double*)(dZ->x))[index1] = 0.0;
+    }
+  }
+
+  const int m = cholmodSpace.dyVec.nDim;
+  for (int k=0; k<m; ++k) {
+    const double dyk = cholmodSpace.dyVec.ele[k];
+    for (int l_index=0; l_index < A[k].LP_sp_nBlock; ++l_index) {
+      const int l        = A[k].LP_sp_index[l_index];
+      const double value = A[k].LP_sp_block[l_index];
+      cholmodSpace.LP_dZ[l] -= dyk * value;
+    }
+    for (int l_index=0; l_index < A[k].SDP_sp_nBlock; ++l_index) {
+      const int l        = A[k].SDP_sp_index[l_index];
+      CompMatrix& Akl    = A[k].SDP_sp_block[l_index];
+      cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+      for (int j_index = 0; j_index<Akl.nzColumn; ++j_index) {
+	const int row_start = Akl.diag_index[j_index];
+	const int row_end   = Akl.column_start[j_index+1];
+	if (row_start == -1) {
+	  continue;
+	}
+	for (int i = row_start; i < row_end; ++i) {
+	  const int agg_index = Akl.agg_index[i];
+	  ((double*)(dZ->x))[agg_index] -= Akl.ele[i]*dyk;
+	}
+      }
+    }
+  } // end of 'for (int k=0; k<m; ++k)'
+      
+  if (phase.value == SolveInfo:: pFEAS
+      || phase.value == SolveInfo::noINFO) {
+    for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+      cholmodSpace.LP_dZ[l] += cholmodSpace.LP_rD[l];
+    }
+    for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+      cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+      cholmod_sparse* rD = cholmodSpace.SDP_block[l].rD;
+      const int length = dZ->nzmax;
+      for (int index1 = 0; index1 < length; ++index1) {
+	((double*)(dZ->x))[index1] += ((double*)(rD->x))[index1];
+      }
+    }
+  }
+  TimeEnd(END_SUMDZ);
+  com.sumDz += TimeCal(START_SUMDZ,END_SUMDZ);
+}
+
+void Newton::compute_DxMat(Solutions& currentPt,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   ComputeTime& com)
+{
+  TimeStart(START_DX);
+  CholmodSpace& cholmodSpace   = currentPt.cholmodSpace;
+  OrderingSpace& orderingSpace = currentPt.order;
+  const double target_mu = beta.value * mu.current;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dX[l] = target_mu * cholmodSpace.LP_invZ[l]
+      - cholmodSpace.LP_X[l]
+      - cholmodSpace.LP_X[l] * cholmodSpace.LP_dZ[l] * cholmodSpace.LP_invZ[l];
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    OrderingMatrix& order        = orderingSpace.SDP_block[l];
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    CliqueMatrix& clique_xMat    = cholmodMatrix.clique_xMat;
+    CliqueMatrix& clique_dX      = cholmodMatrix.clique_dX;
+
+    for (int l2=0; l2<clique_xMat.nBlock; ++l2) {
+      Lal::multiply(clique_dX.ele[l2],clique_xMat.ele[l2],&DMONE);
+    }
+    
+    cholmod_factor* Lz = cholmodMatrix.Lz;
+    cholmod_factor* Lx = cholmodMatrix.Lx;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    cholmod_common& common = cholmodMatrix.common;
+    const int nDim = cholmodMatrix.nDim;
+
+    for (int k=0; k<nDim; ++k) {
+      cholmodMatrix.setB_Zzero();
+      double* b_z = (double*)(cholmodMatrix.b_z->x);
+      b_z[k] = 1.0;
+      cholmodMatrix.solveByZ();
+      double* x_z = (double*)(cholmodMatrix.x_z->x);
+      // x_z is now Zinv[*k]
+
+      double one[2] = {1.0, 0.0};
+      double zero[2] = {0.0, 0.0} ;
+      int transpose = 0; // 0 means no transpose
+      double* b_x = (double*)(cholmodMatrix.b_x->x);
+      cholmod_sdmult(dZ, transpose, one, zero, cholmodMatrix.x_z,
+		     cholmodMatrix.b_x, &common);
+      cholmodMatrix.solveByX();
+      double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+      for (int i=0; i<nDim; ++i) {
+	x_x[i] = target_mu * x_z[i] + -x_x[i];
+      }
+
+      int dXtNonzeros = order.dXtNonzeros[k];
+      int* dXtIndex   = order.dXtIndex[k];
+      int* dXtClique  = order.dXtClique[k];
+      int* dXtBlock   = order.dXtBlock[k];
+      for (int index1 = 0; index1 < dXtNonzeros; ++index1) {
+	DenseMatrix& targetMatrix = clique_dX.ele[dXtClique[index1]];
+	targetMatrix.de_ele[dXtBlock[index1]] += x_x[dXtIndex[index1]];
+      }
+						    
+    } // end of 'for (int k=0; k<nDim; ++k)'
+  }  // end of 'for (int l=0; l<cholmodSpace.SDP_nBlock; ++l)'
+    
+  TimeEnd(END_DX);
+  TimeStart(START_SYMM);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CliqueMatrix& clique_dX = cholmodSpace.SDP_block[l].clique_dX;
+    for (int l2=0; l2<clique_dX.nBlock; ++l2) {
+      Lal::getSymmetrize(clique_dX.ele[l2]);
+    }
+  }
+  TimeEnd(END_SYMM);
+  com.makedX += TimeCal(START_DX,END_DX);
+  com.symmetriseDx += TimeCal(START_SYMM,END_SYMM);
+}
+
+
+void Newton::compute_DxMat_threads(Solutions& currentPt,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   ComputeTime& com)
+{
+  TimeStart(START_DX);
+  CholmodSpace& cholmodSpace   = currentPt.cholmodSpace;
+  OrderingSpace& orderingSpace = currentPt.order;
+  const double target_mu = beta.value * mu.current;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dX[l] = target_mu * cholmodSpace.LP_invZ[l]
+      - cholmodSpace.LP_X[l]
+      - cholmodSpace.LP_X[l] * cholmodSpace.LP_dZ[l] * cholmodSpace.LP_invZ[l];
+  }
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_DX_t* tDX;
+  NewArray(tDX, thread_DX_t, NUM_THREADS);
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    tDX[thread_num].target_mu   = target_mu;
+    tDX[thread_num].thread_num  = thread_num;
+  }
+  
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    OrderingMatrix& order        = orderingSpace.SDP_block[l];
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    CliqueMatrix& clique_xMat    = cholmodMatrix.clique_xMat;
+    CliqueMatrix& clique_dX      = cholmodMatrix.clique_dX;
+
+    for (int l2=0; l2<clique_xMat.nBlock; ++l2) {
+      Lal::multiply(clique_dX.ele[l2],clique_xMat.ele[l2],&DMONE);
+    }
+    const int nDim = cholmodMatrix.nDim;
+    Column_NumberDx = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+      tDX[thread_num].l = l;
+      tDX[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      tDX[thread_num].addr_order         = &order;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_DxMat_threads_SDP,
+		     (void *)&tDX[thread_num]);
+    }
+
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }  // end of 'for (int l=0; l<cholmodSpace.SDP_nBlock; ++l)'
+  DeleteArray(handle);
+  DeleteArray(tDX);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+    
+  TimeEnd(END_DX);
+  TimeStart(START_SYMM);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CliqueMatrix& clique_dX = cholmodSpace.SDP_block[l].clique_dX;
+    for (int l2=0; l2<clique_dX.nBlock; ++l2) {
+      Lal::getSymmetrize(clique_dX.ele[l2]);
+    }
+  }
+  TimeEnd(END_SYMM);
+  com.makedX += TimeCal(START_DX,END_DX);
+  com.symmetriseDx += TimeCal(START_SYMM,END_SYMM);
+}
+
+
+void* Newton::compute_DxMat_threads_SDP(void* arg)
+{
+  thread_DX_t* tDX = (thread_DX_t*) arg;
+  const int       l             = tDX->l;
+  const int       thread_num    = tDX->thread_num;
+  const double    target_mu     = tDX->target_mu;
+  CholmodMatrix&  cholmodMatrix = *(tDX->addr_cholmodMatrix);
+  OrderingMatrix& order         = *(tDX->addr_order);
+
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+  cholmod_sparse* dZ = cholmodMatrix.dZ;
+  cholmod_common& common = cholmodMatrix.common;
+  CliqueMatrix& clique_dX = cholmodMatrix.clique_dX;
+  
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  int k = 0; // dummy initialize
+  
+  while (1) {
+    pthread_mutex_lock(&job_mutex);
+    k = Column_NumberDx;
+    Column_NumberDx++;
+    pthread_mutex_unlock(&job_mutex);
+    if (k >= nDim) {
+      break;
+    }
+    for (int index1=0;index1 < nDim; ++index1) {
+      b_z[index1] = 0.0;
+    }
+    b_z[k] = 1.0;
+    cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+    cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+    cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+    double* x_z = (double*)(x_z1->x);
+
+    // x_z is now Zinv[*k]
+
+    double one[2] = {1.0, 0.0};
+    double zero[2] = {0.0, 0.0} ;
+    int transpose = 0; // 0 means no transpose
+    cholmod_sdmult(dZ, transpose, one, zero, x_z1,
+		   b_x1, &common);
+
+    cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+    cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+    cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+    cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+    double* x_x = (double*)(x_x1->x);
+    
+
+    for (int i=0; i<nDim; ++i) {
+      x_x[i] = target_mu * x_z[i] + -x_x[i];
+    }
+
+    int dXtNonzeros = order.dXtNonzeros[k];
+    int* dXtIndex   = order.dXtIndex[k];
+    int* dXtClique  = order.dXtClique[k];
+    int* dXtBlock   = order.dXtBlock[k];
+    for (int index1 = 0; index1 < dXtNonzeros; ++index1) {
+      DenseMatrix& targetMatrix = clique_dX.ele[dXtClique[index1]];
+      targetMatrix.de_ele[dXtBlock[index1]] += x_x[dXtIndex[index1]];
+    }
+						    
+    cholmod_free_dense(&b2,&common);
+    cholmod_free_dense(&x_z2,&common);
+    cholmod_free_dense(&x_z1,&common);
+    cholmod_free_dense(&b_x2,&common);
+    cholmod_free_dense(&x_x3,&common);
+    cholmod_free_dense(&x_x2,&common);
+    cholmod_free_dense(&x_x1,&common);
+  } // end of 'for (int k=0; k<nDim; ++k)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+  return NULL;
+}
+
+
+
+bool Newton::Mehrotra(Newton::WHICH_DIRECTION direction,
+		      int m,
+		      InputData& inputData,
+		      Chordal& chordal,
+		      Solutions& currentPt,
+		      Residuals& currentRes,
+		      AverageComplementarity& mu,
+		      DirectionParameter& beta,
+		      Switch& reduction,
+		      Phase& phase,
+		      ComputeTime& com,
+		      FILE* Display, FILE* fpOut)
+{
+  // rMessage("BEFORE CHOLESKY");  currentPt.display();
+  
+  if (direction == PREDICTOR) {
+    if (currentPt.cholmodSpace.getCholesky(currentPt.order) == SDPA_FAILURE) {
+      return SDPA_FAILURE;
+    }
+  }
+  // rMessage("AFTER CHOLESKY");  currentPt.display();
+
+  if (direction == PREDICTOR) {
+    Make_bMatgVec(inputData, currentPt, currentRes, mu,
+		  beta, phase, com);
+  }
+
+  bool ret = compute_DyVec(direction,
+			   m, inputData, chordal,
+			   currentPt, com, Display, fpOut);
+  if (ret == SDPA_FAILURE) {
+    return SDPA_FAILURE;
+  }
+  // rMessage("dy = ");
+  // currentPt.cholmodSpace.dyVec.display();
+  compute_DzMat(inputData, currentPt, currentRes, phase, com);
+  TimeStart(START5);
+  #if 0
+  compute_DxMat(currentPt, mu, beta, com);
+  #else
+  compute_DxMat_threads(currentPt, mu, beta, com);
+  #endif
+  TimeEnd(END5);
+  com.makedXdZ += TimeCal(START5,END5);
+  // rMessage("After Schur currentPt"); currentPt.display();
+  return true;
+}
+
+void Newton::checkDirection(int m, InputData& inputData,
+			    Solutions& currentPt,
+			    Residuals& currentRes,
+			    AverageComplementarity& mu,
+			    DirectionParameter& beta,
+			    Switch& reduction,
+			    Phase& phase,
+			    ComputeTime& com,
+			    FILE* Display, FILE* fpOut)
+{  // Check the direction
+  rMessage("Checking Directions");
+  CholmodSpace&  cholmodSpace = currentPt.cholmodSpace;
+  OrderingSpace& order        = currentPt.order;
+  CompSpace& C = inputData.C;
+  CompSpace* A = inputData.A;
+  Vector&    b = inputData.b;
+
+  // Check [A \bullet (X+dX) == b]
+  double* rp;
+  NewArray(rp, double, m);
+  for (int k=0; k<m; ++k) {
+    double ip = 0.0; // dummy initialize
+    cholmodSpace.getInnerProductAX(ip, A[k], order);
+    double dip = 0.0; // dummy initialize
+    cholmodSpace.getInnerProductAdX(dip, A[k], order);
+    rp[k] = b.ele[k] - ip - dip;
+  }
+  double sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    sum += rp[k]*rp[k];
+  }
+  double norm_rp = sqrt(sum);
+  DeleteArray(rp);
+  // printf("norm_dp = %e\n", norm_dp);
+
+  // Check [I == clique_invCholeskyX^T * clique_xMat * clique_invCholeskyX]
+  sum = 0.0;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    double diff = cholmodSpace.LP_X[l]*cholmodSpace.LP_invX[l] - 1.0;
+    sum += diff*diff;
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int l2=0; l2<cholmodMatrix.clique_xMat.nBlock; ++l2) {
+      DenseMatrix& xMat = cholmodMatrix.clique_xMat.ele[l2];
+      DenseMatrix& invCholeskyX = cholmodMatrix.clique_invCholeskyX.ele[l2];
+      DenseMatrix multi1;
+      multi1.initialize(xMat.nRow, xMat.nCol);
+      Lal::tran_multiply(multi1,invCholeskyX,xMat);
+      DenseMatrix multi2;
+      multi2.initialize(xMat.nRow, xMat.nCol);
+      Lal::multiply(multi2, multi1, invCholeskyX);
+      #if 0
+      rMessage("xMat         = "); xMat.display();
+      rMessage("invCholeskyX = "); invCholeskyX.display();
+      rMessage("multi1       = "); multi1.display();
+      rMessage("multi2       = "); multi2.display();
+      #endif
+      double* ele = multi2.de_ele;
+      for (int i=0; i<xMat.nRow; ++i) {
+	ele[i+xMat.nRow*i] -= 1.0;
+      }
+      for (int i=0; i<xMat.nRow*xMat.nCol; ++i) {
+	sum += ele[i]*ele[i];
+      }
+    }
+  }
+  double norm_invX = sqrt(sum);
+  // printf("norm_invX = %e\n", norm_invX);
+  
+  // Check [C - A^(y+dy) - (Z+dZ) == O]
+  sum = 0.0;
+  double* rDl;
+  NewArray(rDl, double, cholmodSpace.LP_nBlock);
+  for (int l=0; l < cholmodSpace.LP_nBlock; ++l) {
+    rDl[l] = 0.0;
+  }
+  
+  for (int l_index = 0; l_index < C.LP_sp_nBlock; ++l_index) {
+    const int l        = C.LP_sp_index[l_index];
+    const double value = C.LP_sp_block[l_index];
+    rDl[l] = value;
+  }
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k]; 
+    const double dyk = cholmodSpace.dyVec.ele[k]; 
+    for (int l_index = 0; l_index < A[k].LP_sp_nBlock; ++l_index) {
+      const int l        = A[k].LP_sp_index[l_index];
+      const double value = A[k].LP_sp_block[l_index];
+      rDl[l] -= value*(yk+dyk);
+    }
+  }
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    rDl[l] -= (cholmodSpace.LP_Z[l] + cholmodSpace.LP_dZ[l]);
+  }
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    sum += rDl[l]*rDl[l];
+  }
+
+
+  cholmod_sparse** rDs;
+  NewArray(rDs, cholmod_sparse*, cholmodSpace.SDP_nBlock);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    cholmod_sparse* Z = cholmodMatrix.Z;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    rDs[l] = cholmod_copy_sparse(Z, &cholmodMatrix.common);
+    const int length = Z->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      ((double*)(rDs[l]->x))[index1] = -((double*)(Z->x))[index1] -((double*)(dZ->x))[index1];
+    }
+  }
+  for (int l_index=0; l_index < C.SDP_sp_nBlock; ++l_index) {
+    const int    l = C.SDP_sp_index[l_index];
+    CompMatrix& Cl = C.SDP_sp_block[l_index];
+    cholmod_sparse* rD = rDs[l];
+    for (int j_index = 0; j_index<Cl.nzColumn; ++j_index) {
+      const int row_start = Cl.diag_index[j_index];
+      const int row_end   = Cl.column_start[j_index+1];
+      if (row_start == -1) {
+	continue;
+      }
+      for (int i = row_start; i < row_end; ++i) {
+	int agg_index = Cl.agg_index[i];
+	((double*)(rD->x))[agg_index] += Cl.ele[i];
+      }
+    }
+  }
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k]; 
+    const double dyk = cholmodSpace.dyVec.ele[k]; 
+    for (int l_index=0; l_index < A[k].SDP_sp_nBlock; ++l_index) {
+      const int     l = A[k].SDP_sp_index[l_index];
+      CompMatrix& Akl = A[k].SDP_sp_block[l_index];
+      cholmod_sparse* rD = rDs[l];
+      for (int j_index = 0; j_index<Akl.nzColumn; ++j_index) {
+	const int row_start = Akl.diag_index[j_index];
+	const int row_end   = Akl.column_start[j_index+1];
+	if (row_start == -1) {
+	  continue;
+	}
+	for (int i = row_start; i < row_end; ++i) {
+	  int agg_index = Akl.agg_index[i];
+	  ((double*)(rD->x))[agg_index] -= Akl.ele[i]*(yk+dyk);
+	}
+      }
+    }
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    const int length = rDs[l]->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      double value = ((double*)(rDs[l]->x))[index1];
+      sum += value*value;
+    }
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    cholmod_free_sparse(&rDs[l], &cholmodSpace.SDP_block[l].common);
+  }
+  DeleteArray(rDs);
+  double norm_rD = sqrt(sum);
+  // printf("norm_rD = %e\n", norm_rD);
+
+  const double target_mu = beta.value * mu.current;
+
+  // Check [(X+dX)*(dZ+Z) - mu*I == O]
+  sum = 0.0;
+  double min_X = 1.0e+50;
+  double min_Z = 1.0e+50;
+
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    double  X = cholmodSpace.LP_X[l];
+    double dX = cholmodSpace.LP_dX[l];
+    double  Z = cholmodSpace.LP_Z[l];
+    double dZ = cholmodSpace.LP_dZ[l];
+    double diff = (X+dX)*(dZ+Z) - target_mu * 1;
+    // rMessage("diff = " << diff);
+    sum += diff*diff;
+    if (X < min_X) {
+      min_X = X;
+    }
+    if (Z < min_Z) {
+      min_Z = Z;
+    }
+  }
+
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    int nDim = cholmodMatrix.nDim;
+    cholmod_sparse*  Z = cholmodMatrix.Z;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    DenseMatrix D_Z;
+    DenseMatrix D_ZdZ;
+    D_Z.initialize(nDim, nDim);
+    D_Z.setZero();
+    D_ZdZ.initialize(nDim, nDim);
+    D_ZdZ.setZero();
+
+    DenseMatrix D_Result;
+    D_Result.initialize(nDim, nDim);
+    D_Result.setZero();
+    
+    const int ncol = (int) Z->ncol;
+    for (int j=0; j < ncol; ++j) {
+      const int start_row = ((int*)Z->p)[j];
+      const int end_row   = ((int*)Z->p)[j+1];
+      for (int i_index = start_row; i_index < end_row; ++i_index) {
+	const int    i     = ((   int*)Z->i)[i_index];
+	const double value = ((double*)Z->x)[i_index];
+	const double dvalue = ((double*)dZ->x)[i_index];
+	D_Z.de_ele[i+j*nDim] = value;
+	D_Z.de_ele[j+i*nDim] = value;
+	D_ZdZ.de_ele[i+j*nDim] = value+dvalue;
+	D_ZdZ.de_ele[j+i*nDim] = value+dvalue;
+      }
+    }
+  }
+  rMessage("This routine is only check");
+  rMessage("This routine is not implemented fully");
+  rMessage("The computation of R is not implemented fully");
+  double norm_R = sqrt(sum);
+
+  rMessage("inv(X)*X = inv(Z)*Z = I?");
+
+  sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k];
+    sum += yk*yk;
+  }
+  double norm_y = sqrt(sum);
+  
+  sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    const double dyk = cholmodSpace.dyVec.ele[k];
+    sum += dyk*dyk;
+  }
+  double norm_dy = sqrt(sum);
+
+  printf("norm :: rp=%.2e, invX=%.2e, rD=%.2e, R=%.2e\n",
+	 norm_rp, norm_invX, norm_rD, norm_R);
+  // printf("min_X = %.2e, min_Z = %.2e\n", min_X, min_Z);
+  // printf("norm :: y=%.2e, dy=%.2e\n", norm_y, norm_dy);
+}
+
+  
+
+void Newton::display(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+
+  rMessage("This function is not implemented in SDPA-C");
+  #if 0
+  fprintf(fpout,"rNewton.DxMat = \n");
+  DxMat.display(fpout);
+  fprintf(fpout,"rNewton.DyVec = \n");
+  DyVec.display(fpout);
+  fprintf(fpout,"rNewton.DzMat = \n");
+  DzMat.display(fpout);
+  #endif
+}
+
+void Newton::display_index(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+  printf("display_index: %d %d\n",SDP_nBlock,LP_nBlock);
+
+  for (int l=0; l<SDP_nBlock; l++){
+    printf("SDP:%dth block\n",l);
+    for (int k=0; k<SDP_number[l]; k++){
+      printf("SDP(i=%d,ib=%d; j=%d,jb=%d) for target = %d\n",
+	     SDP_constraint1[l][k],SDP_blockIndex1[l][k],
+	     SDP_constraint2[l][k],SDP_blockIndex2[l][k], 
+	     SDP_location_sparse_bMat[l][k]);
+    }
+  }
+
+  for (int l=0; l<LP_nBlock; l++){
+    printf("LP:%dth block\n",l);
+    for (int k=0; k<LP_number[l]; k++){
+      printf("LP(i=%d,ib=%d; j=%d,jb=%d) for target = %d\n",
+	     LP_constraint1[l][k],LP_blockIndex1[l][k],
+	     LP_constraint2[l][k],LP_blockIndex2[l][k], 
+	     LP_location_sparse_bMat[l][k]);
+    }
+
+  }
+
+}
+
+void Newton::display_sparse_bMat(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+  fprintf(fpout,"{\n");
+  for (int index=0; index<sparse_bMat.NonZeroCount; ++index) {
+    int i        = sparse_bMat.row_index[index];
+    int j        = sparse_bMat.column_index[index];
+    double value = sparse_bMat.sp_ele[index];
+    fprintf(fpout,"val[%d,%d] = %e\n", i,j,value);
+  }
+  fprintf(fpout,"}\n");
+}
+
+} // end of namespace 'sdpa'
+
diff --git a/sdpa-c/sdpa-c-bug/sdpa_newton_fix.cpp b/sdpa-c/sdpa-c-bug/sdpa_newton_fix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..51a29c04df577d335d2d46033303283b4abcd91c
--- /dev/null
+++ b/sdpa-c/sdpa-c-bug/sdpa_newton_fix.cpp
@@ -0,0 +1,1890 @@
+/* -------------------------------------------------------------
+
+This file is a component of SDPA
+Copyright (C) 2004-2013 SDPA Project
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+
+------------------------------------------------------------- */
+
+#include "sdpa_newton.h"
+#include "sdpa_parts.h"
+#include "sdpa_linear.h"  
+#include "sdpa_algebra.h"
+
+namespace sdpa {
+
+pthread_mutex_t Newton::job_mutex = PTHREAD_MUTEX_INITIALIZER;
+int Newton::Column_Number   = 0;
+int Newton::Column_NumberDx = 0;
+
+
+Newton::Newton()
+{
+  // Caution: if SDPA doesn't use sparse bMat, 
+  //          following variables are indefinite.
+  this->SDP_nBlock = -1;
+  SDP_number = NULL;  SDP_location_sparse_bMat = NULL;
+  SDP_constraint1 = NULL;  SDP_constraint2 = NULL;
+  SDP_blockIndex1 = NULL;  SDP_blockIndex2 = NULL;
+  this->LP_nBlock = -1;
+  LP_number = NULL;  LP_location_sparse_bMat = NULL;
+  LP_constraint1 = NULL;  LP_constraint2 = NULL;
+  LP_blockIndex1 = NULL;  LP_blockIndex2 = NULL;
+
+  diagonalIndex = NULL;
+  NUM_THREADS  = 1;
+}
+
+Newton::Newton(int m, BlockStruct& bs)
+{
+  initialize(m, bs);
+}
+
+Newton::~Newton()
+{
+  finalize();
+}
+
+void Newton::initialize(int m, BlockStruct& bs)
+{
+  gVec.initialize(m);
+
+  SDP_nBlock  = bs.SDP_nBlock;
+  LP_nBlock   = bs.LP_nBlock;
+  
+  bMat_type = DENSE;
+  // Caution: if SDPA doesn't use sparse bMat, 
+  //          following variables are indefinite.
+  this->SDP_nBlock = -1;
+  SDP_number = NULL;  SDP_location_sparse_bMat = NULL;
+  SDP_constraint1 = NULL;  SDP_constraint2 = NULL;
+  SDP_blockIndex1 = NULL;  SDP_blockIndex2 = NULL;
+  this->LP_nBlock = -1;
+  LP_number = NULL;  LP_location_sparse_bMat = NULL;
+  LP_constraint1 = NULL;  LP_constraint2 = NULL;
+  LP_blockIndex1 = NULL;  LP_blockIndex2 = NULL;
+
+  diagonalIndex = NULL;
+}
+
+void Newton::finalize()
+{
+
+  if (bMat_type == SPARSE){
+
+    if (SDP_location_sparse_bMat && SDP_constraint1 && SDP_constraint2
+	&& SDP_blockIndex1 && SDP_blockIndex2) {
+      for (int l=0; l<SDP_nBlock; ++l) {
+	DeleteArray(SDP_location_sparse_bMat[l]);
+	DeleteArray(SDP_constraint1[l]);
+	DeleteArray(SDP_constraint2[l]);
+	DeleteArray(SDP_blockIndex1[l]);
+	DeleteArray(SDP_blockIndex2[l]);
+	DeleteArray(SDP_startIndex2[l]);
+      }
+      DeleteArray(SDP_number);
+      DeleteArray(SDP_location_sparse_bMat);
+      DeleteArray(SDP_constraint1);
+      DeleteArray(SDP_constraint2);
+      DeleteArray(SDP_blockIndex1);
+      DeleteArray(SDP_blockIndex2);
+      DeleteArray(SDP_startIndex2);
+      DeleteArray(SDP_nStartIndex2);
+    }
+    if (LP_location_sparse_bMat && LP_constraint1 && LP_constraint2
+	&& LP_blockIndex1 && LP_blockIndex2) {
+      for (int l=0; l<LP_nBlock; ++l) {
+	DeleteArray(LP_location_sparse_bMat[l]);
+	DeleteArray(LP_constraint1[l]);
+	DeleteArray(LP_constraint2[l]);
+	DeleteArray(LP_blockIndex1[l]);
+	DeleteArray(LP_blockIndex2[l]);
+	DeleteArray(LP_startIndex2[l]);
+      }
+      DeleteArray(LP_number);
+      DeleteArray(LP_location_sparse_bMat);
+      DeleteArray(LP_constraint1);
+      DeleteArray(LP_constraint2);
+      DeleteArray(LP_blockIndex1);
+      DeleteArray(LP_blockIndex2);
+      DeleteArray(LP_startIndex2);
+      DeleteArray(LP_nStartIndex2);
+    }
+
+    DeleteArray(diagonalIndex);
+    sparse_bMat.finalize();
+
+  } else { // bMat_type == DENSE
+    bMat.finalize();
+  }
+
+  int m = gVec.nDim;
+  gVec.finalize();
+}
+
+void Newton::initialize_dense_bMat(int m)
+{
+  //  bMat_type = DENSE;
+  //  printf("DENSE computations\n");
+  bMat.initialize(m,m);
+}
+
+  // 2008/03/12 kazuhide nakata
+void Newton::initialize_sparse_bMat(int m)
+{
+
+  //  bMat_type = SPARSE;
+  //  printf("SPARSE computation\n");
+
+  // initialize sparse_bMat by Chordal::makeGraph
+  //  sparse_bMat.display();
+
+  bool isEmptyMatrix = false;
+  // make index of diagonalIndex
+  NewArray(diagonalIndex,int,m+1);
+  int k=0;
+  for (int index=0; index<sparse_bMat.NonZeroCount; index++){
+    if (sparse_bMat.row_index[index] == sparse_bMat.column_index[index]) {
+      diagonalIndex[k] = index;
+      if (sparse_bMat.row_index[index] != k+1) {
+	rMessage("The matrix [" << (sparse_bMat.row_index[index]-1)
+		 << "] is empty");
+	isEmptyMatrix = true;
+	diagonalIndex[k+1] = diagonalIndex[k];
+	k++;
+      }
+      k++;
+    }
+  }
+  if (isEmptyMatrix) {
+    rMessage("Input Data Error :: Some Input Matricies are Empty");
+  }
+    
+  diagonalIndex[m] = sparse_bMat.NonZeroCount;
+  #if 0
+  rMessage("diagonalIndex = ");
+  for (int index=0; index <m; ++index) {
+    printf(" [%d:%d]",index,diagonalIndex[index]);
+  }
+  printf("\n");
+  #endif
+}
+
+  // 2008/03/12 kazuhide nakata
+void Newton::initialize_bMat(int m, Chordal& chordal,
+			     InputData& inputData,
+			     FILE* Display,
+                             FILE* fpOut)
+{
+  /* Create clique tree */
+
+  switch (chordal.best) {
+  case SELECT_DENSE: 
+    bMat_type = DENSE;
+    if (Display) {
+      fprintf(Display,"Schur computation : DENSE \n");
+    }
+    if (fpOut) {
+      fprintf(fpOut,"Schur computation : DENSE \n");
+    }
+    initialize_dense_bMat(m);
+    // Here, we release MUMPS and sparse_bMat
+    chordal.finalize();
+    break;
+  case SELECT_MUMPS_BEST: 
+    bMat_type = SPARSE;
+    if (Display) {
+      fprintf(Display,"Schur computation : SPARSE \n");
+    }
+    if (fpOut) {
+      fprintf(fpOut,"Schur computation : SPARSE \n");
+    }
+    initialize_sparse_bMat(m);
+    make_aggrigateIndex(inputData);
+    break;
+  default: 
+    rError("Wrong Ordering Obtained");
+    break;
+  }
+
+}
+
+int Newton::binarySearchIndex(int i, int j)
+{
+  // binary search for index of sparse_bMat 
+  int t = -1;
+  // We store only the lower triangular
+  int ii = i, jj = j;
+  if (i<j) {
+    jj = i;
+    ii = j;
+  }
+  int begin  = diagonalIndex[jj]; 
+  int end    = diagonalIndex[jj+1]-1;
+  int target = (begin + end) / 2;
+  while (end - begin > 1){
+    if (sparse_bMat.row_index[target] < ii+1){
+      begin = target;
+      target = (begin + end) / 2;
+    } else if (sparse_bMat.row_index[target] > ii+1){
+      end = target;
+      target = (begin + end) / 2;
+    } else if (sparse_bMat.row_index[target] == ii+1) {
+      t = target;
+      break;
+    }
+  }
+  if (t == -1){
+    if (sparse_bMat.row_index[begin] == ii+1){
+      t = begin;
+    } else if (sparse_bMat.row_index[end] == ii+1){
+      t = end;
+    } else {
+      #if 0
+      int m = sparse_bMat.nRow;
+      rMessage("Trouble ii = " << ii << " jj = " << j <<  " m = " << m);
+      for (int k = 0; k<sparse_bMat.NonZeroCount; ++k) {
+	fprintf(stdout,"[%04d,%04d] at %04d\n",
+		sparse_bMat.row_index[k],
+		sparse_bMat.column_index[k],
+		k);
+      }
+      #endif
+      // rError("Newton::make_aggrigateIndex program bug");
+    }
+  } 
+  return t;
+}
+  
+
+void Newton::make_aggrigateIndex_SDP(InputData& inputData)
+{
+
+  SDP_nBlock = inputData.SDP_nBlock;
+  NewArray(SDP_number,int,SDP_nBlock);
+
+  // memory allocate for aggrigateIndex
+  NewArray(SDP_constraint1,int*,SDP_nBlock);
+  NewArray(SDP_constraint2,int*,SDP_nBlock);
+  NewArray(SDP_blockIndex1,int*,SDP_nBlock);
+  NewArray(SDP_blockIndex2,int*,SDP_nBlock);
+  NewArray(SDP_location_sparse_bMat,int*,SDP_nBlock);
+  NewArray(SDP_nStartIndex2, int, SDP_nBlock);
+  NewArray(SDP_startIndex2, int*, SDP_nBlock);
+
+  for (int l=0; l<SDP_nBlock; l++){
+    const int size = (inputData.SDP_nConstraint[l] + 1) 
+      * inputData.SDP_nConstraint[l] / 2;
+    SDP_number[l] = size;
+    NewArray(SDP_constraint1[l],int,size);
+    NewArray(SDP_constraint2[l],int,size);
+    NewArray(SDP_blockIndex1[l],int,size);
+    NewArray(SDP_blockIndex2[l],int,size);
+    NewArray(SDP_location_sparse_bMat[l],int,size);
+  }
+
+  for (int l = 0; l<SDP_nBlock; l++){
+    int NonZeroCount = 0;
+    vector<int> startTmp;
+
+    for (int k1=0; k1<inputData.SDP_nConstraint[l]; k1++){
+      int j = inputData.SDP_constraint[l][k1];
+      int jb = inputData.SDP_blockIndex[l][k1];
+      startTmp.push_back(NonZeroCount);
+
+      for (int k2=0; k2<inputData.SDP_nConstraint[l]; k2++){
+	int i = inputData.SDP_constraint[l][k2];
+	int ib = inputData.SDP_blockIndex[l][k2];
+
+	if (i < j) {
+	  continue;
+	}
+	// set index which A_i and A_j are not zero matrix
+	int target = binarySearchIndex(i,j);
+	if (target == -1) {
+	  rMessage("("<<(i+1)<<","<<(j+1)<<") might have index");
+	  SDP_number[l]--;
+	  continue;
+	}
+	SDP_constraint1[l][NonZeroCount] = i;
+	SDP_constraint2[l][NonZeroCount] = j;
+	SDP_blockIndex1[l][NonZeroCount] = ib;
+	SDP_blockIndex2[l][NonZeroCount] = jb;
+	SDP_location_sparse_bMat[l][NonZeroCount] = target;
+	NonZeroCount++;
+      }
+    } // for k1
+    // The last element to stop the array
+    startTmp.push_back(NonZeroCount);
+    const int startTmp_size = startTmp.size();
+    SDP_nStartIndex2[l] = startTmp_size-1; // the last is the stopper
+    NewArray(SDP_startIndex2[l], int, startTmp_size);
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      SDP_startIndex2[l][index1] = startTmp[index1];
+    }
+    #if 0
+    rMessage("SDP_startIndex["<<l
+	     <<"][" << startTmp_size << "] = ");
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      printf("%d->%d ", index1,startTmp[index1]);
+    }
+    #endif
+  } //for l  lth block
+}
+
+void Newton::make_aggrigateIndex_LP(InputData& inputData)
+{
+  LP_nBlock = inputData.LP_nBlock;
+  NewArray(LP_number,int,LP_nBlock);
+
+  // memory allocate for aggrigateIndex
+  NewArray(LP_constraint1,int*,LP_nBlock);
+  NewArray(LP_constraint2,int*,LP_nBlock);
+  NewArray(LP_blockIndex1,int*,LP_nBlock);
+  NewArray(LP_blockIndex2,int*,LP_nBlock);
+  NewArray(LP_location_sparse_bMat,int*,LP_nBlock);
+  NewArray(LP_nStartIndex2, int, LP_nBlock);
+  NewArray(LP_startIndex2, int*, LP_nBlock);
+
+  for (int l=0; l<LP_nBlock; l++){
+    int size = (inputData.LP_nConstraint[l]+1)
+      * inputData.LP_nConstraint[l]/2;
+    LP_number[l] = size;
+    NewArray(LP_constraint1[l],int,size);
+    NewArray(LP_constraint2[l],int,size);
+    NewArray(LP_blockIndex1[l],int,size);
+    NewArray(LP_blockIndex2[l],int,size);
+    NewArray(LP_location_sparse_bMat[l],int,size);
+  }
+
+  for (int l = 0; l<LP_nBlock; l++){
+    int NonZeroCount = 0;
+    vector<int> startTmp;
+
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++){
+      int j = inputData.LP_constraint[l][k1];
+      int jb = inputData.LP_blockIndex[l][k1];
+      startTmp.push_back(NonZeroCount);
+
+      for (int k2=0; k2<inputData.LP_nConstraint[l]; k2++){
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+
+	if (i < j){
+	  continue;
+	}
+
+	// set index which A_i and A_j are not zero matrix
+	LP_constraint1[l][NonZeroCount] = i;
+	LP_constraint2[l][NonZeroCount] = j;
+	LP_blockIndex1[l][NonZeroCount] = ib;
+	LP_blockIndex2[l][NonZeroCount] = jb;
+	
+	int target = binarySearchIndex(i,j);
+	LP_location_sparse_bMat[l][NonZeroCount] = target;
+	NonZeroCount++;
+      }
+    } // for k1
+    // The last element to stop the array
+    startTmp.push_back(NonZeroCount);
+    const int startTmp_size = startTmp.size();
+    LP_nStartIndex2[l] = startTmp_size-1; // the last is the stopper
+    NewArray(LP_startIndex2[l], int, startTmp_size);
+    for (int index1 = 0; index1 < startTmp_size; ++index1) {
+      LP_startIndex2[l][index1] = startTmp[index1];
+    }
+  } //for l  lth block
+}
+
+void Newton::make_aggrigateIndex(InputData& inputData)
+{
+  make_aggrigateIndex_SDP(inputData);
+  make_aggrigateIndex_LP(inputData);
+}
+
+
+void Newton::setNumThreads(FILE* Display, FILE* fpOut, int NumThreads)
+{
+  if (NumThreads == 0) { // Automatic from OMP_NUM_THREADS
+    char* env1      = NULL;
+    env1 = getenv("OMP_NUM_THREADS");
+    if (env1 != NULL) {
+      NUM_THREADS = atoi(env1);
+      if (NUM_THREADS < 0) {
+	rError("OMP_NUM_THREADS is negative!!");
+      }
+    }
+    else {
+      NUM_THREADS = 1;
+    }
+  }
+  else {
+    NUM_THREADS = NumThreads;
+  }
+  if (Display) {
+    fprintf(Display,"NumThreads  is set as %d\n", NUM_THREADS);
+  }
+  if (fpOut) {
+    fprintf(fpOut,  "NumThreads  is set as %d\n", NUM_THREADS);
+  }
+  // This affects for only OpenBLAS
+  blas_set_num_threads(NUM_THREADS);
+}
+
+void Newton::compute_bMatgVec_dense(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  // rMessage("mu.current = " << mu.current);
+  // rMessage("target_mu = " << target_mu);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  // rMessage("cholmodSpace = "); cholmodSpace.display();
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++) {
+      const int j = inputData.LP_constraint[l][k1];
+      const int jb = inputData.LP_blockIndex[l][k1];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      const double xMatInvzMatAj = xMat * invzMat * Aj;
+      for (int k2=k1; k2<inputData.LP_nConstraint[l]; k2++) {
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+	double Ai = inputData.A[i].LP_sp_block[ib];
+	double value =  xMatInvzMatAj * Ai;
+	// only lower triangular will be computed
+	// rMessage("add to (" << i << " , " << j << ")");
+	bMat.de_ele[i+m*j] += value;
+      } // end of 'for (int i)'
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	gadd = xMat * rD;
+      }
+      gVec.ele[j] += Aj * invzMat * (gadd - target_mu);
+    } // end of 'for (int j)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int k1=0; k1<inputData.SDP_nConstraint[l]; k1++) {
+      int j = inputData.SDP_constraint[l][k1];
+      int jb = inputData.SDP_blockIndex[l][k1];
+      // rMessage("j = " << j << " : jb = " << jb);
+      CompMatrix& Aj = inputData.A[j].SDP_sp_block[jb];
+
+      const int nCol = Aj.nzColumn;
+      double gj1 = 0.0;
+      double gj2 = 0.0;
+      for (int k_index = 0; k_index < nCol; ++k_index) {
+	const int k = Aj.column_index[k_index];
+	// rMessage("k = " << k << ", k_index = " << k_index);
+	// rMessage("j = " << j << " : jb = " << jb);
+	cholmodMatrix.setB_Zzero();
+	double* b_z = (double*)(cholmodMatrix.b_z->x);
+	const int i_start = Aj.column_start[k_index];
+	const int i_end   = Aj.column_start[k_index+1];
+	for (int i_index = i_start; i_index < i_end; ++i_index) {
+	  const int i2 = Aj.row_index[i_index];
+	  b_z[i2] = Aj.ele[i_index];
+	}
+	cholmodMatrix.solveByZ();
+	cholmodMatrix.setB_Xzero();
+	double* b_x = (double*)(cholmodMatrix.b_x->x);
+	b_x[k] = 1.0;
+	cholmodMatrix.solveByX();
+	double* x_z = (double*)(cholmodMatrix.x_z->x);
+	double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+	#if 0
+	rMessage("Aj = "); Aj.display();
+	rMessage(" b_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_x);
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	rMessage(" b_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_z);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	// rMessage(" rD = ");
+	// CholmodMatrix::display_sparse(cholmodMatrix.rD);
+	#endif
+	
+	double gadd = 0.0;
+	if (phase.value == SolveInfo:: pFEAS
+	    || phase.value == SolveInfo::noINFO) {
+	  Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	}
+	// rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+	gj1 += gadd;
+	gj2 += x_z[k];
+	for (int k2=k1; k2<inputData.SDP_nConstraint[l]; k2++) {
+	  int i = inputData.SDP_constraint[l][k2];
+	  int ib = inputData.SDP_blockIndex[l][k2];
+	  CompMatrix& Ai = inputData.A[i].SDP_sp_block[ib];
+	  double Badd = 0.0;
+	  Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	  bMat.de_ele[i+j*m] += Badd;
+	  #if 0
+	  rMessage("i=" << i << ", j=" << j
+		   << ", k=" << k << ", Badd=" << Badd );
+	  rMessage(" b_x = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.b_x);
+	  rMessage(" x_x = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	  rMessage(" b_z = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.b_z);
+	  rMessage(" x_z = ");
+	  CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	  rMessage(" Ai  = "); Ai.display();
+	  #endif
+	}
+      } // end of 'for (int k_index = 0; k_index < nCol; ++k_index)'
+      // rMessage("target_mu = " << target_mu);
+      gVec.ele[j] += (gj1 - target_mu * gj2);
+    } // end of 'for (int k1)'
+  } // end of 'for (int l)'
+
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+
+void Newton::compute_bMatgVec_dense_threads(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  // rMessage("mu.current = " << mu.current);
+  // rMessage("target_mu = " << target_mu);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  // rMessage("cholmodSpace = "); cholmodSpace.display();
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int k1=0; k1<inputData.LP_nConstraint[l]; k1++) {
+      const int j = inputData.LP_constraint[l][k1];
+      const int jb = inputData.LP_blockIndex[l][k1];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      const double xMatInvzMatAj = xMat * invzMat * Aj;
+      for (int k2=k1; k2<inputData.LP_nConstraint[l]; k2++) {
+	int i = inputData.LP_constraint[l][k2];
+	int ib = inputData.LP_blockIndex[l][k2];
+	double Ai = inputData.A[i].LP_sp_block[ib];
+	double value =  xMatInvzMatAj * Ai;
+	// only lower triangular will be computed
+	// rMessage("add to (" << i << " , " << j << ")");
+	bMat.de_ele[i+m*j] += value;
+      } // end of 'for (int i)'
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	gadd = xMat * rD;
+      }
+      gVec.ele[j] += Aj * invzMat * (gadd - target_mu);
+    } // end of 'for (int j)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_arg_t* targ;
+  NewArray(targ, thread_arg_t, NUM_THREADS);
+  
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    targ[thread_num].m              = m;
+    targ[thread_num].target_mu      = target_mu;
+    targ[thread_num].thread_num     = thread_num;
+    targ[thread_num].addr_inputData = &inputData;
+    targ[thread_num].addr_bMat      = &bMat;
+    targ[thread_num].addr_gVec      = &gVec;
+    targ[thread_num].addr_phase     = &phase;
+  }
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    Column_Number = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      targ[thread_num].l = l;
+      targ[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_bMatgVec_dense_threads_SDP,
+		     (void *)&targ[thread_num]);
+    }
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }
+
+  DeleteArray(handle);
+  DeleteArray(targ);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+    
+void* Newton::compute_bMatgVec_dense_threads_SDP(void* arg) 
+{
+  thread_arg_t* targ = (thread_arg_t*) arg;
+  const int l =  targ->l;
+  const int m =  targ->m;
+  const double target_mu = targ->target_mu;
+  const int thread_num = targ->thread_num;
+  InputData&     inputData     = *(targ->addr_inputData);
+  CholmodMatrix& cholmodMatrix = *(targ->addr_cholmodMatrix);
+  DenseMatrix&   bMat          = *(targ->addr_bMat);
+  Vector&        gVec          = *(targ->addr_gVec);
+  Phase&         phase         = *(targ->addr_phase);
+  CompSpace* A = inputData.A;
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_common& common = cholmodMatrix.common;
+
+  const int SDP_nConstraintl = inputData.SDP_nConstraint[l];
+  int* SDP_constraintl = inputData.SDP_constraint[l];
+  int* SDP_blockIndexl = inputData.SDP_blockIndex[l];
+
+
+  // for multi-threads computing work space
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+  int k1 = 0; // dummy initialize
+  while (1) {
+    pthread_mutex_lock(&job_mutex);
+    k1 = Column_Number;
+    Column_Number++;
+    pthread_mutex_unlock(&job_mutex);
+    if (k1>=SDP_nConstraintl) {
+      break;
+    }
+
+    int j = SDP_constraintl[k1];
+    int jb = SDP_blockIndexl[k1];
+    // rMessage("j = " << j << " : jb = " << jb);
+    CompMatrix& Aj = A[j].SDP_sp_block[jb];
+
+    const int nCol = Aj.nzColumn;
+    double gj1 = 0.0;
+    double gj2 = 0.0;
+    for (int k_index = 0; k_index < nCol; ++k_index) {
+      const int k = Aj.column_index[k_index];
+      // rMessage("k = " << k << ", k_index = " << k_index);
+      // rMessage("j = " << j << " : jb = " << jb);
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_z[index1] = 0.0;
+      }
+      const int i_start = Aj.column_start[k_index];
+      const int i_end   = Aj.column_start[k_index+1];
+      for (int i_index = i_start; i_index < i_end; ++i_index) {
+	const int i2 = Aj.row_index[i_index];
+	b_z[i2] = Aj.ele[i_index];
+      }
+
+      cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+      cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+      cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+      double* x_z = (double*)(x_z1->x);
+      
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_x[index1] = 0.0;
+      }
+      b_x[k] = 1.0;
+      
+      cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+      cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+      cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+      cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+      double* x_x = (double*)(x_x1->x);
+#if 0
+      rMessage("Aj = "); Aj.display();
+      rMessage(" b_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.b_x1);
+      rMessage(" x_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_x1);
+      rMessage(" b_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.b_z1);
+      rMessage(" x_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_z1);
+      // rMessage(" rD = ");
+      // CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+      double gadd = 0.0;
+      if (phase.value == SolveInfo::pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+      }
+      // rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+      gj1 += gadd;
+      gj2 += x_z[k];
+      for (int k2=k1; k2<inputData.SDP_nConstraint[l]; k2++) {
+	int i = SDP_constraintl[k2];
+	int ib = SDP_blockIndexl[k2];
+	CompMatrix& Ai = A[i].SDP_sp_block[ib];
+	double Badd = 0.0;
+	Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	bMat.de_ele[i+j*m] += Badd;
+#if 0
+	rMessage("i=" << i << ", j=" << j
+		 << ", k=" << k << ", Badd=" << Badd );
+	rMessage(" b_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_x1);
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x1);
+	rMessage(" b_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.b_z1);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z1);
+	rMessage(" Ai  = "); Ai.display();
+#endif
+      }
+      cholmod_free_dense(&b2,&common);
+      cholmod_free_dense(&x_z2,&common);
+      cholmod_free_dense(&b_x2,&common);
+      cholmod_free_dense(&x_x3,&common);
+      cholmod_free_dense(&x_x2,&common);      
+      cholmod_free_dense(&x_x1,&common);
+      cholmod_free_dense(&x_z1,&common);
+    } // end of 'for (int k_index = 0; k_index < nCol; ++k_index)'
+    // rMessage("target_mu = " << target_mu);
+    gVec.ele[j] += (gj1 - target_mu * gj2);
+  } // end of 'for (int k1)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+  return NULL;
+}
+
+
+
+void Newton::compute_bMatgVec_sparse(InputData& inputData,
+				     Solutions& currentPt,
+				     Residuals& currentRes,
+				     AverageComplementarity& mu,
+				     DirectionParameter& beta,
+				     Phase& phase,
+				     ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int index2 = 0; index2 < LP_nStartIndex2[l]; ++index2) {
+      const int start_iter = LP_startIndex2[l][index2];
+      const int end_iter   = LP_startIndex2[l][index2+1];
+      const int j  = LP_constraint2[l][start_iter];
+      const int jb = LP_blockIndex2[l][start_iter];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      gVec.ele[j] += Aj * invzMat * ( xMat * rD  - target_mu);
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	const int i = LP_constraint1[l][iter];
+	const int ib = LP_blockIndex1[l][iter];
+	const double Ai = inputData.A[i].LP_sp_block[ib];
+	const double value = xMat * invzMat * Ai * Aj;
+	sparse_bMat.sp_ele[LP_location_sparse_bMat[l][iter]] += value;
+      }
+    } // end of 'for (int start_iter)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int index2 = 0; index2 < SDP_nStartIndex2[l]; ++index2) {
+      const int start_iter = SDP_startIndex2[l][index2];
+      const int end_iter   = SDP_startIndex2[l][index2+1];
+      int j  = SDP_constraint2[l][start_iter];
+      int jb = SDP_blockIndex2[l][start_iter];
+      CompMatrix& Aj = inputData.A[j].SDP_sp_block[jb];
+
+      const int nCol = Aj.nzColumn;
+      double gj1 = 0.0;
+      double gj2 = 0.0;
+      for (int k_index = 0; k_index < nCol; ++k_index) {
+	const int k = Aj.column_index[k_index];
+	cholmodMatrix.setB_Zzero();
+	double* b_z = (double*)(cholmodMatrix.b_z->x);
+	const int i_start = Aj.column_start[k_index];
+	const int i_end   = Aj.column_start[k_index+1];
+	for (int i_index = i_start; i_index < i_end; ++i_index) {
+	  const int i2 = Aj.row_index[i_index];
+	  b_z[i2] = Aj.ele[i_index];
+	}
+	cholmodMatrix.solveByZ();
+	cholmodMatrix.setB_Xzero();
+	double* b_x = (double*)(cholmodMatrix.b_x->x);
+	b_x[k] = 1.0;
+	cholmodMatrix.solveByX();
+	double* x_z = (double*)(cholmodMatrix.x_z->x);
+	double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+#if 0
+	rMessage(" x_x = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_x);
+	rMessage(" x_z = ");
+	CholmodMatrix::display_dense(cholmodMatrix.x_z);
+	rMessage(" rD = ");
+	CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+	double gadd = 0.0;
+	if (phase.value == SolveInfo:: pFEAS
+	    || phase.value == SolveInfo::noINFO) {
+	  Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	  // rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+	}
+	gj1 += gadd;
+	gj2 += x_z[k];
+
+	for (int iter = start_iter; iter < end_iter; ++iter) {
+	  int i = SDP_constraint1[l][iter];
+	  int ib = SDP_blockIndex1[l][iter];
+	  CompMatrix& Ai = inputData.A[i].SDP_sp_block[ib];
+	  double Badd = 0.0;
+	  Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	  sparse_bMat.sp_ele[SDP_location_sparse_bMat[l][iter]] += Badd;
+	}
+      } // end of 'for (int k_index)'
+      gVec.ele[j] += (gj1 - target_mu * gj2);
+    } // end of 'for (int j)'
+  }
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+
+void Newton::compute_bMatgVec_sparse_threads(InputData& inputData,
+				    Solutions& currentPt,
+				    Residuals& currentRes,
+				    AverageComplementarity& mu,
+				    DirectionParameter& beta,
+				    Phase& phase,
+				    ComputeTime& com)
+{
+  const int m = currentPt.mDim;
+  const int LP_nBlock    = inputData.LP_nBlock;
+  const double target_mu = beta.value * mu.current;
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  TimeStart(LP_START);
+  for (int l=0; l<LP_nBlock; ++l) {
+    const double xMat    = cholmodSpace.LP_X[l];
+    const double invzMat = cholmodSpace.LP_invZ[l];
+    const double rD      = cholmodSpace.LP_rD[l];
+    for (int index2 = 0; index2 < LP_nStartIndex2[l]; ++index2) {
+      const int start_iter = LP_startIndex2[l][index2];
+      const int end_iter   = LP_startIndex2[l][index2+1];
+      const int j  = LP_constraint2[l][start_iter];
+      const int jb = LP_blockIndex2[l][start_iter];
+      const double Aj = inputData.A[j].LP_sp_block[jb];
+      gVec.ele[j] += Aj * invzMat * ( xMat * rD  - target_mu);
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	const int i = LP_constraint1[l][iter];
+	const int ib = LP_blockIndex1[l][iter];
+	const double Ai = inputData.A[i].LP_sp_block[ib];
+	const double value = xMat * invzMat * Ai * Aj;
+	sparse_bMat.sp_ele[LP_location_sparse_bMat[l][iter]] += value;
+      }
+    } // end of 'for (int start_iter)'
+  } // end of 'for (int l)'
+  TimeEnd(LP_END);
+  com.B_DIAG += TimeCal(LP_START, LP_END);
+
+  TimeStart(SDP_START);
+  const int SDP_nBlock = inputData.SDP_nBlock;
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_arg_s* targ;
+  NewArray(targ, thread_arg_s, NUM_THREADS);
+  
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    targ[thread_num].m                = m;
+    targ[thread_num].target_mu        = target_mu;
+    targ[thread_num].thread_num       = thread_num;
+    targ[thread_num].addr_inputData   = &inputData;
+    targ[thread_num].addr_sparse_bMat = &sparse_bMat;
+    targ[thread_num].addr_gVec        = &gVec;
+    targ[thread_num].addr_phase       = &phase;
+    targ[thread_num].addr_newton      = this;
+  }
+  for (int l=0; l<SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    Column_Number = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      targ[thread_num].l = l;
+      targ[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_bMatgVec_sparse_threads_SDP,
+		     (void *)&targ[thread_num]);
+    }
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++ thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }
+
+  DeleteArray(handle);
+  DeleteArray(targ);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+  TimeEnd(SDP_END);
+  com.B_F2 += TimeCal(SDP_START, SDP_END);
+}
+    
+void* Newton::compute_bMatgVec_sparse_threads_SDP(void* arg) 
+{
+  thread_arg_s* targ = (thread_arg_s*) arg;
+  const int l =  targ->l;
+  const int m =  targ->m;
+  const double target_mu = targ->target_mu;
+  const int thread_num = targ->thread_num;
+  InputData&     inputData     = *(targ->addr_inputData);
+  CholmodMatrix& cholmodMatrix = *(targ->addr_cholmodMatrix);
+  SparseMatrix&  sparse_bMat   = *(targ->addr_sparse_bMat);
+  Vector&        gVec          = *(targ->addr_gVec);
+  Phase&         phase         = *(targ->addr_phase);
+  Newton&        newton        = *(targ->addr_newton);
+  CompSpace* A = inputData.A;
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_common& common = cholmodMatrix.common;
+
+  int* SDP_constraint1l = newton.SDP_constraint1[l];
+  int* SDP_blockIndex1l = newton.SDP_blockIndex1[l];
+  int* SDP_constraint2l = newton.SDP_constraint2[l];
+  int* SDP_blockIndex2l = newton.SDP_blockIndex2[l];
+  int* SDP_startIndex2l = newton.SDP_startIndex2[l];
+  const int SDP_nStartIndex2l    = newton.SDP_nStartIndex2[l];
+  int* SDP_location_sparse_bMatl = newton.SDP_location_sparse_bMat[l];
+
+  // for multi-threads computing work space
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+
+  int index2 = 0; // dummy initialize
+  while (1) {
+    pthread_mutex_lock(&job_mutex);
+    index2 = Column_Number++;
+    pthread_mutex_unlock(&job_mutex);
+    if (index2 >= SDP_nStartIndex2l) {
+      break;
+    }
+    const int start_iter = SDP_startIndex2l[index2];
+    const int end_iter   = SDP_startIndex2l[index2+1];
+    int j  = SDP_constraint2l[start_iter];
+    int jb = SDP_blockIndex2l[start_iter];
+    CompMatrix& Aj = A[j].SDP_sp_block[jb];
+
+    const int nCol = Aj.nzColumn;
+    double gj1 = 0.0;
+    double gj2 = 0.0;
+    for (int k_index = 0; k_index < nCol; ++k_index) {
+      const int k = Aj.column_index[k_index];
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_z[index1] = 0.0;
+      }
+      const int i_start = Aj.column_start[k_index];
+      const int i_end   = Aj.column_start[k_index+1];
+      for (int i_index = i_start; i_index < i_end; ++i_index) {
+	const int i2 = Aj.row_index[i_index];
+	b_z[i2] = Aj.ele[i_index];
+      }
+      cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+      cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+      cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+      double* x_z = (double*)(x_z1->x);
+      for (int index1=0;index1 < nDim; ++index1) {
+	b_x[index1] = 0.0;
+      }
+      b_x[k] = 1.0;
+      
+      cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+      cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+      cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+      cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+      double* x_x = (double*)(x_x1->x);
+
+#if 0
+      rMessage(" x_x = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_x);
+      rMessage(" x_z = ");
+      CholmodMatrix::display_dense(cholmodMatrix.x_z);
+      rMessage(" rD = ");
+      CholmodMatrix::display_sparse(cholmodMatrix.rD);
+#endif
+	
+      double gadd = 0.0;
+      if (phase.value == SolveInfo:: pFEAS
+	  || phase.value == SolveInfo::noINFO) {
+	Lal::getInnerProduct(gadd, cholmodMatrix.rD, x_x, x_z);
+	// rMessage("gj1 = " << gadd << " : gj2 = " << x_z[k] << " : k = " << k);
+      }
+      gj1 += gadd;
+      gj2 += x_z[k];
+
+      for (int iter = start_iter; iter < end_iter; ++iter) {
+	int i = SDP_constraint1l[iter];
+	int ib = SDP_blockIndex1l[iter];
+	CompMatrix& Ai = A[i].SDP_sp_block[ib];
+	double Badd = 0.0;
+	Lal::getInnerProduct(Badd, Ai, x_x, x_z);
+	sparse_bMat.sp_ele[SDP_location_sparse_bMatl[iter]] += Badd;
+      }
+      cholmod_free_dense(&b2,&common);
+      cholmod_free_dense(&x_z2,&common);
+      cholmod_free_dense(&b_x2,&common);
+      cholmod_free_dense(&x_x3,&common);
+      cholmod_free_dense(&x_x2,&common);      
+      cholmod_free_dense(&x_x1,&common);
+      cholmod_free_dense(&x_z1,&common);
+    } // end of 'for (int k_index)'
+    gVec.ele[j] += (gj1 - target_mu * gj2);
+  } // end of 'for (int index2, aka j)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+  return NULL;
+}
+
+
+
+void Newton::Make_bMatgVec(InputData& inputData,
+			   Solutions& currentPt,
+			   Residuals& currentRes,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   Phase& phase,
+			   ComputeTime& com)
+{
+  TimeStart(START3);
+  gVec.copyFrom(inputData.b);
+  if (bMat_type == SPARSE){
+    // set sparse_bMat zero
+    sdpa_dset(sparse_bMat.NonZeroCount, 0.0, sparse_bMat.sp_ele, 1);
+    #if 0
+    compute_bMatgVec_sparse(inputData, currentPt, currentRes, mu, beta,
+    			    phase, com);
+    #else
+    compute_bMatgVec_sparse_threads(inputData, currentPt, currentRes, mu, beta,
+    			    phase, com);
+    #endif
+  } else {
+    bMat.setZero();
+    #if 0
+    compute_bMatgVec_dense(inputData, currentPt, currentRes, mu, beta,
+			   phase, com);
+    #else
+    compute_bMatgVec_dense_threads(inputData, currentPt, currentRes, mu, beta,
+			   phase, com);
+    #endif
+  }
+  
+  #if 0
+  // display_index();
+  rMessage("bMat =  ");
+  if (bMat_type == DENSE) {
+    bMat.display();
+  }
+  else {
+    sparse_bMat.display();
+  }
+  rMessage("gVec = ");
+  gVec.display();
+  #endif
+  TimeEnd(END3);
+  com.makebMatgVec += TimeCal(START3,END3);
+}
+
+bool Newton::compute_DyVec(Newton::WHICH_DIRECTION direction,
+			   int m,
+			   InputData& inputData,
+			   Chordal& chordal,
+			   Solutions& currentPt,
+			   ComputeTime& com,
+			   FILE* Display, FILE* fpOut)
+{
+  if (direction == PREDICTOR) {
+    TimeStart(START3_2);
+    
+    if (bMat_type == SPARSE){
+      bool ret = chordal.factorizeSchur(m, diagonalIndex, Display, fpOut);
+      if (ret == SDPA_FAILURE) {
+	return SDPA_FAILURE;
+      }
+    } else {
+      bool ret = Lal::choleskyFactorWithAdjust(bMat);
+      if (ret == SDPA_FAILURE) {
+	return SDPA_FAILURE;
+      }
+    }
+    // rMessage("Cholesky of bMat =  ");
+    // bMat.display();
+    // sparse_bMat.display();
+    TimeEnd(END3_2);
+    com.choleskybMat += TimeCal(START3_2,END3_2);
+  }
+  // bMat is already cholesky factorized.
+
+
+  Vector& DyVec = currentPt.cholmodSpace.dyVec;
+  
+  TimeStart(START4);
+  if (bMat_type == SPARSE){
+    DyVec.copyFrom(gVec);
+    chordal.solveSchur(DyVec);
+  } else {
+    Lal::let(DyVec,'=',bMat,'/',gVec);
+  }
+  TimeEnd(END4);
+  com.solve += TimeCal(START4,END4);
+  // rMessage("DyVec =  ");
+  // DyVec.display();
+  return SDPA_SUCCESS;
+}
+
+void Newton::compute_DzMat(InputData& inputData,
+			   Solutions& currentPt,
+			   Residuals& currentRes,
+			   Phase& phase,
+			   ComputeTime& com)
+{
+  TimeStart(START_SUMDZ);
+  CholmodSpace& cholmodSpace = currentPt.cholmodSpace;
+  CompSpace* A = inputData.A;
+
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dZ[l] = 0.0;
+  }
+  
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+    const int length = dZ->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      ((double*)(dZ->x))[index1] = 0.0;
+    }
+  }
+
+  const int m = cholmodSpace.dyVec.nDim;
+  for (int k=0; k<m; ++k) {
+    const double dyk = cholmodSpace.dyVec.ele[k];
+    for (int l_index=0; l_index < A[k].LP_sp_nBlock; ++l_index) {
+      const int l        = A[k].LP_sp_index[l_index];
+      const double value = A[k].LP_sp_block[l_index];
+      cholmodSpace.LP_dZ[l] -= dyk * value;
+    }
+    for (int l_index=0; l_index < A[k].SDP_sp_nBlock; ++l_index) {
+      const int l        = A[k].SDP_sp_index[l_index];
+      CompMatrix& Akl    = A[k].SDP_sp_block[l_index];
+      cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+      for (int j_index = 0; j_index<Akl.nzColumn; ++j_index) {
+	const int row_start = Akl.diag_index[j_index];
+	const int row_end   = Akl.column_start[j_index+1];
+	if (row_start == -1) {
+	  continue;
+	}
+	for (int i = row_start; i < row_end; ++i) {
+	  const int agg_index = Akl.agg_index[i];
+	  ((double*)(dZ->x))[agg_index] -= Akl.ele[i]*dyk;
+	}
+      }
+    }
+  } // end of 'for (int k=0; k<m; ++k)'
+      
+  if (phase.value == SolveInfo:: pFEAS
+      || phase.value == SolveInfo::noINFO) {
+    for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+      cholmodSpace.LP_dZ[l] += cholmodSpace.LP_rD[l];
+    }
+    for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+      cholmod_sparse* dZ = cholmodSpace.SDP_block[l].dZ;
+      cholmod_sparse* rD = cholmodSpace.SDP_block[l].rD;
+      const int length = dZ->nzmax;
+      for (int index1 = 0; index1 < length; ++index1) {
+	((double*)(dZ->x))[index1] += ((double*)(rD->x))[index1];
+      }
+    }
+  }
+  TimeEnd(END_SUMDZ);
+  com.sumDz += TimeCal(START_SUMDZ,END_SUMDZ);
+}
+
+void Newton::compute_DxMat(Solutions& currentPt,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   ComputeTime& com)
+{
+  TimeStart(START_DX);
+  CholmodSpace& cholmodSpace   = currentPt.cholmodSpace;
+  OrderingSpace& orderingSpace = currentPt.order;
+  const double target_mu = beta.value * mu.current;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dX[l] = target_mu * cholmodSpace.LP_invZ[l]
+      - cholmodSpace.LP_X[l]
+      - cholmodSpace.LP_X[l] * cholmodSpace.LP_dZ[l] * cholmodSpace.LP_invZ[l];
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    OrderingMatrix& order        = orderingSpace.SDP_block[l];
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    CliqueMatrix& clique_xMat    = cholmodMatrix.clique_xMat;
+    CliqueMatrix& clique_dX      = cholmodMatrix.clique_dX;
+
+    for (int l2=0; l2<clique_xMat.nBlock; ++l2) {
+      Lal::multiply(clique_dX.ele[l2],clique_xMat.ele[l2],&DMONE);
+    }
+    
+    cholmod_factor* Lz = cholmodMatrix.Lz;
+    cholmod_factor* Lx = cholmodMatrix.Lx;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    cholmod_common& common = cholmodMatrix.common;
+    const int nDim = cholmodMatrix.nDim;
+
+    for (int k=0; k<nDim; ++k) {
+      cholmodMatrix.setB_Zzero();
+      double* b_z = (double*)(cholmodMatrix.b_z->x);
+      b_z[k] = 1.0;
+      cholmodMatrix.solveByZ();
+      double* x_z = (double*)(cholmodMatrix.x_z->x);
+      // x_z is now Zinv[*k]
+
+      double one[2] = {1.0, 0.0};
+      double zero[2] = {0.0, 0.0} ;
+      int transpose = 0; // 0 means no transpose
+      double* b_x = (double*)(cholmodMatrix.b_x->x);
+      cholmod_sdmult(dZ, transpose, one, zero, cholmodMatrix.x_z,
+		     cholmodMatrix.b_x, &common);
+      cholmodMatrix.solveByX();
+      double* x_x = (double*)(cholmodMatrix.x_x->x);
+
+      for (int i=0; i<nDim; ++i) {
+	x_x[i] = target_mu * x_z[i] + -x_x[i];
+      }
+
+      int dXtNonzeros = order.dXtNonzeros[k];
+      int* dXtIndex   = order.dXtIndex[k];
+      int* dXtClique  = order.dXtClique[k];
+      int* dXtBlock   = order.dXtBlock[k];
+      for (int index1 = 0; index1 < dXtNonzeros; ++index1) {
+	DenseMatrix& targetMatrix = clique_dX.ele[dXtClique[index1]];
+	targetMatrix.de_ele[dXtBlock[index1]] += x_x[dXtIndex[index1]];
+      }
+						    
+    } // end of 'for (int k=0; k<nDim; ++k)'
+  }  // end of 'for (int l=0; l<cholmodSpace.SDP_nBlock; ++l)'
+    
+  TimeEnd(END_DX);
+  TimeStart(START_SYMM);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CliqueMatrix& clique_dX = cholmodSpace.SDP_block[l].clique_dX;
+    for (int l2=0; l2<clique_dX.nBlock; ++l2) {
+      Lal::getSymmetrize(clique_dX.ele[l2]);
+    }
+  }
+  TimeEnd(END_SYMM);
+  com.makedX += TimeCal(START_DX,END_DX);
+  com.symmetriseDx += TimeCal(START_SYMM,END_SYMM);
+}
+
+
+void Newton::compute_DxMat_threads(Solutions& currentPt,
+			   AverageComplementarity& mu,
+			   DirectionParameter& beta,
+			   ComputeTime& com)
+{
+  TimeStart(START_DX);
+  CholmodSpace& cholmodSpace   = currentPt.cholmodSpace;
+  OrderingSpace& orderingSpace = currentPt.order;
+  const double target_mu = beta.value * mu.current;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    cholmodSpace.LP_dX[l] = target_mu * cholmodSpace.LP_invZ[l]
+      - cholmodSpace.LP_X[l]
+      - cholmodSpace.LP_X[l] * cholmodSpace.LP_dZ[l] * cholmodSpace.LP_invZ[l];
+  }
+
+  // To improve CHOLMOD performance, turn off BLAS parallel
+  blas_set_num_threads(1);
+  
+  int ret;
+  ret = pthread_mutex_init(&job_mutex, NULL);
+  if (ret  != 0) {
+    rError("pthread_mutex_init error");
+  }
+  
+  pthread_t* handle;
+  NewArray(handle, pthread_t, NUM_THREADS);
+  thread_DX_t* tDX;
+  NewArray(tDX, thread_DX_t, NUM_THREADS);
+  for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+    tDX[thread_num].target_mu   = target_mu;
+    tDX[thread_num].thread_num  = thread_num;
+  }
+  
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    OrderingMatrix& order        = orderingSpace.SDP_block[l];
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    CliqueMatrix& clique_xMat    = cholmodMatrix.clique_xMat;
+    CliqueMatrix& clique_dX      = cholmodMatrix.clique_dX;
+
+    for (int l2=0; l2<clique_xMat.nBlock; ++l2) {
+      Lal::multiply(clique_dX.ele[l2],clique_xMat.ele[l2],&DMONE);
+    }
+    const int nDim = cholmodMatrix.nDim;
+    Column_NumberDx = 0;
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+      tDX[thread_num].l = l;
+      tDX[thread_num].addr_cholmodMatrix = &cholmodMatrix;
+      tDX[thread_num].addr_order         = &order;
+      pthread_create(&handle[thread_num], NULL,
+		     compute_DxMat_threads_SDP,
+		     (void *)&tDX[thread_num]);
+    }
+
+    for (int thread_num = 0; thread_num < NUM_THREADS; ++thread_num) {
+      pthread_join(handle[thread_num], NULL);
+    }
+  }  // end of 'for (int l=0; l<cholmodSpace.SDP_nBlock; ++l)'
+  DeleteArray(handle);
+  DeleteArray(tDX);
+  ret = pthread_mutex_destroy(&job_mutex);
+  if (ret != 0) {
+    rError("pthread_mutex_destroy error in sdpa_newton.cpp");
+  }
+  blas_set_num_threads(NUM_THREADS);
+    
+  TimeEnd(END_DX);
+  TimeStart(START_SYMM);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CliqueMatrix& clique_dX = cholmodSpace.SDP_block[l].clique_dX;
+    for (int l2=0; l2<clique_dX.nBlock; ++l2) {
+      Lal::getSymmetrize(clique_dX.ele[l2]);
+    }
+  }
+  TimeEnd(END_SYMM);
+  com.makedX += TimeCal(START_DX,END_DX);
+  com.symmetriseDx += TimeCal(START_SYMM,END_SYMM);
+}
+
+
+void* Newton::compute_DxMat_threads_SDP(void* arg)
+{
+  thread_DX_t* tDX = (thread_DX_t*) arg;
+  const int       l             = tDX->l;
+  const int       thread_num    = tDX->thread_num;
+  const double    target_mu     = tDX->target_mu;
+  CholmodMatrix&  cholmodMatrix = *(tDX->addr_cholmodMatrix);
+  OrderingMatrix& order         = *(tDX->addr_order);
+
+  const int nDim = cholmodMatrix.nDim;
+  cholmod_factor* Lz = cholmodMatrix.Lz;
+  cholmod_factor* Lx = cholmodMatrix.Lx;
+  cholmod_sparse* dZ = cholmodMatrix.dZ;
+  cholmod_common& common = cholmodMatrix.common;
+  CliqueMatrix& clique_dX = cholmodMatrix.clique_dX;
+  
+  cholmod_dense* b_x1;
+  cholmod_dense* b_z1;
+  b_x1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  b_z1 = cholmod_allocate_dense(nDim,1,nDim,CHOLMOD_REAL,
+				&common);
+  double* b_x = (double*)(b_x1->x);
+  double* b_z = (double*)(b_z1->x);
+
+  int k = 0; // dummy initialize
+  
+  while (1) {
+    pthread_mutex_lock(&job_mutex);
+    k = Column_NumberDx;
+    Column_NumberDx++;
+    pthread_mutex_unlock(&job_mutex);
+    if (k >= nDim) {
+      break;
+    }
+    for (int index1=0;index1 < nDim; ++index1) {
+      b_z[index1] = 0.0;
+    }
+    b_z[k] = 1.0;
+    cholmod_dense* b2   = cholmod_solve(CHOLMOD_P, Lz, b_z1, &common);
+    cholmod_dense* x_z2 = cholmod_solve(CHOLMOD_LDLt, Lz, b2, &common);
+    cholmod_dense* x_z1 = cholmod_solve(CHOLMOD_Pt, Lz, x_z2, &common);
+    double* x_z = (double*)(x_z1->x);
+
+    // x_z is now Zinv[*k]
+
+    double one[2] = {1.0, 0.0};
+    double zero[2] = {0.0, 0.0} ;
+    int transpose = 0; // 0 means no transpose
+    cholmod_sdmult(dZ, transpose, one, zero, x_z1,
+		   b_x1, &common);
+
+    cholmod_dense* b_x2 = cholmod_solve(CHOLMOD_P, Lx, b_x1,  &common);
+    cholmod_dense* x_x3 = cholmod_solve(CHOLMOD_L, Lx, b_x2, &common);
+    cholmod_dense* x_x2 = cholmod_solve(CHOLMOD_Lt,Lx, x_x3, &common);
+    cholmod_dense* x_x1 = cholmod_solve(CHOLMOD_Pt, Lx, x_x2, &common);
+    double* x_x = (double*)(x_x1->x);
+    
+
+    for (int i=0; i<nDim; ++i) {
+      x_x[i] = target_mu * x_z[i] + -x_x[i];
+    }
+
+    int dXtNonzeros = order.dXtNonzeros[k];
+    int* dXtIndex   = order.dXtIndex[k];
+    int* dXtClique  = order.dXtClique[k];
+    int* dXtBlock   = order.dXtBlock[k];
+    for (int index1 = 0; index1 < dXtNonzeros; ++index1) {
+      DenseMatrix& targetMatrix = clique_dX.ele[dXtClique[index1]];
+      targetMatrix.de_ele[dXtBlock[index1]] += x_x[dXtIndex[index1]];
+    }
+						    
+    cholmod_free_dense(&b2,&common);
+    cholmod_free_dense(&x_z2,&common);
+    cholmod_free_dense(&x_z1,&common);
+    cholmod_free_dense(&b_x2,&common);
+    cholmod_free_dense(&x_x3,&common);
+    cholmod_free_dense(&x_x2,&common);
+    cholmod_free_dense(&x_x1,&common);
+  } // end of 'for (int k=0; k<nDim; ++k)'
+  cholmod_free_dense(&b_x1,&common);
+  cholmod_free_dense(&b_z1,&common);
+  return NULL;
+}
+
+
+
+bool Newton::Mehrotra(Newton::WHICH_DIRECTION direction,
+		      int m,
+		      InputData& inputData,
+		      Chordal& chordal,
+		      Solutions& currentPt,
+		      Residuals& currentRes,
+		      AverageComplementarity& mu,
+		      DirectionParameter& beta,
+		      Switch& reduction,
+		      Phase& phase,
+		      ComputeTime& com,
+		      FILE* Display, FILE* fpOut)
+{
+  // rMessage("BEFORE CHOLESKY");  currentPt.display();
+  
+  if (direction == PREDICTOR) {
+    if (currentPt.cholmodSpace.getCholesky(currentPt.order) == SDPA_FAILURE) {
+      return SDPA_FAILURE;
+    }
+  }
+  // rMessage("AFTER CHOLESKY");  currentPt.display();
+
+  if (direction == PREDICTOR) {
+    Make_bMatgVec(inputData, currentPt, currentRes, mu,
+		  beta, phase, com);
+  }
+
+  bool ret = compute_DyVec(direction,
+			   m, inputData, chordal,
+			   currentPt, com, Display, fpOut);
+  if (ret == SDPA_FAILURE) {
+    return SDPA_FAILURE;
+  }
+  // rMessage("dy = ");
+  // currentPt.cholmodSpace.dyVec.display();
+  compute_DzMat(inputData, currentPt, currentRes, phase, com);
+  TimeStart(START5);
+  #if 0
+  compute_DxMat(currentPt, mu, beta, com);
+  #else
+  compute_DxMat_threads(currentPt, mu, beta, com);
+  #endif
+  TimeEnd(END5);
+  com.makedXdZ += TimeCal(START5,END5);
+  // rMessage("After Schur currentPt"); currentPt.display();
+  return true;
+}
+
+void Newton::checkDirection(int m, InputData& inputData,
+			    Solutions& currentPt,
+			    Residuals& currentRes,
+			    AverageComplementarity& mu,
+			    DirectionParameter& beta,
+			    Switch& reduction,
+			    Phase& phase,
+			    ComputeTime& com,
+			    FILE* Display, FILE* fpOut)
+{  // Check the direction
+  rMessage("Checking Directions");
+  CholmodSpace&  cholmodSpace = currentPt.cholmodSpace;
+  OrderingSpace& order        = currentPt.order;
+  CompSpace& C = inputData.C;
+  CompSpace* A = inputData.A;
+  Vector&    b = inputData.b;
+
+  // Check [A \bullet (X+dX) == b]
+  double* rp;
+  NewArray(rp, double, m);
+  for (int k=0; k<m; ++k) {
+    double ip = 0.0; // dummy initialize
+    cholmodSpace.getInnerProductAX(ip, A[k], order);
+    double dip = 0.0; // dummy initialize
+    cholmodSpace.getInnerProductAdX(dip, A[k], order);
+    rp[k] = b.ele[k] - ip - dip;
+  }
+  double sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    sum += rp[k]*rp[k];
+  }
+  double norm_rp = sqrt(sum);
+  DeleteArray(rp);
+  // printf("norm_dp = %e\n", norm_dp);
+
+  // Check [I == clique_invCholeskyX^T * clique_xMat * clique_invCholeskyX]
+  sum = 0.0;
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    double diff = cholmodSpace.LP_X[l]*cholmodSpace.LP_invX[l] - 1.0;
+    sum += diff*diff;
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    for (int l2=0; l2<cholmodMatrix.clique_xMat.nBlock; ++l2) {
+      DenseMatrix& xMat = cholmodMatrix.clique_xMat.ele[l2];
+      DenseMatrix& invCholeskyX = cholmodMatrix.clique_invCholeskyX.ele[l2];
+      DenseMatrix multi1;
+      multi1.initialize(xMat.nRow, xMat.nCol);
+      Lal::tran_multiply(multi1,invCholeskyX,xMat);
+      DenseMatrix multi2;
+      multi2.initialize(xMat.nRow, xMat.nCol);
+      Lal::multiply(multi2, multi1, invCholeskyX);
+      #if 0
+      rMessage("xMat         = "); xMat.display();
+      rMessage("invCholeskyX = "); invCholeskyX.display();
+      rMessage("multi1       = "); multi1.display();
+      rMessage("multi2       = "); multi2.display();
+      #endif
+      double* ele = multi2.de_ele;
+      for (int i=0; i<xMat.nRow; ++i) {
+	ele[i+xMat.nRow*i] -= 1.0;
+      }
+      for (int i=0; i<xMat.nRow*xMat.nCol; ++i) {
+	sum += ele[i]*ele[i];
+      }
+    }
+  }
+  double norm_invX = sqrt(sum);
+  // printf("norm_invX = %e\n", norm_invX);
+  
+  // Check [C - A^(y+dy) - (Z+dZ) == O]
+  sum = 0.0;
+  double* rDl;
+  NewArray(rDl, double, cholmodSpace.LP_nBlock);
+  for (int l=0; l < cholmodSpace.LP_nBlock; ++l) {
+    rDl[l] = 0.0;
+  }
+  
+  for (int l_index = 0; l_index < C.LP_sp_nBlock; ++l_index) {
+    const int l        = C.LP_sp_index[l_index];
+    const double value = C.LP_sp_block[l_index];
+    rDl[l] = value;
+  }
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k]; 
+    const double dyk = cholmodSpace.dyVec.ele[k]; 
+    for (int l_index = 0; l_index < A[k].LP_sp_nBlock; ++l_index) {
+      const int l        = A[k].LP_sp_index[l_index];
+      const double value = A[k].LP_sp_block[l_index];
+      rDl[l] -= value*(yk+dyk);
+    }
+  }
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    rDl[l] -= (cholmodSpace.LP_Z[l] + cholmodSpace.LP_dZ[l]);
+  }
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    sum += rDl[l]*rDl[l];
+  }
+
+
+  cholmod_sparse** rDs;
+  NewArray(rDs, cholmod_sparse*, cholmodSpace.SDP_nBlock);
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    cholmod_sparse* Z = cholmodMatrix.Z;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    rDs[l] = cholmod_copy_sparse(Z, &cholmodMatrix.common);
+    const int length = Z->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      ((double*)(rDs[l]->x))[index1] = -((double*)(Z->x))[index1] -((double*)(dZ->x))[index1];
+    }
+  }
+  for (int l_index=0; l_index < C.SDP_sp_nBlock; ++l_index) {
+    const int    l = C.SDP_sp_index[l_index];
+    CompMatrix& Cl = C.SDP_sp_block[l_index];
+    cholmod_sparse* rD = rDs[l];
+    for (int j_index = 0; j_index<Cl.nzColumn; ++j_index) {
+      const int row_start = Cl.diag_index[j_index];
+      const int row_end   = Cl.column_start[j_index+1];
+      if (row_start == -1) {
+	continue;
+      }
+      for (int i = row_start; i < row_end; ++i) {
+	int agg_index = Cl.agg_index[i];
+	((double*)(rD->x))[agg_index] += Cl.ele[i];
+      }
+    }
+  }
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k]; 
+    const double dyk = cholmodSpace.dyVec.ele[k]; 
+    for (int l_index=0; l_index < A[k].SDP_sp_nBlock; ++l_index) {
+      const int     l = A[k].SDP_sp_index[l_index];
+      CompMatrix& Akl = A[k].SDP_sp_block[l_index];
+      cholmod_sparse* rD = rDs[l];
+      for (int j_index = 0; j_index<Akl.nzColumn; ++j_index) {
+	const int row_start = Akl.diag_index[j_index];
+	const int row_end   = Akl.column_start[j_index+1];
+	if (row_start == -1) {
+	  continue;
+	}
+	for (int i = row_start; i < row_end; ++i) {
+	  int agg_index = Akl.agg_index[i];
+	  ((double*)(rD->x))[agg_index] -= Akl.ele[i]*(yk+dyk);
+	}
+      }
+    }
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    const int length = rDs[l]->nzmax;
+    for (int index1 = 0; index1 < length; ++index1) {
+      double value = ((double*)(rDs[l]->x))[index1];
+      sum += value*value;
+    }
+  }
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    cholmod_free_sparse(&rDs[l], &cholmodSpace.SDP_block[l].common);
+  }
+  DeleteArray(rDs);
+  double norm_rD = sqrt(sum);
+  // printf("norm_rD = %e\n", norm_rD);
+
+  const double target_mu = beta.value * mu.current;
+
+  // Check [(X+dX)*(dZ+Z) - mu*I == O]
+  sum = 0.0;
+  double min_X = 1.0e+50;
+  double min_Z = 1.0e+50;
+
+  for (int l=0; l<cholmodSpace.LP_nBlock; ++l) {
+    double  X = cholmodSpace.LP_X[l];
+    double dX = cholmodSpace.LP_dX[l];
+    double  Z = cholmodSpace.LP_Z[l];
+    double dZ = cholmodSpace.LP_dZ[l];
+    double diff = (X+dX)*(dZ+Z) - target_mu * 1;
+    // rMessage("diff = " << diff);
+    sum += diff*diff;
+    if (X < min_X) {
+      min_X = X;
+    }
+    if (Z < min_Z) {
+      min_Z = Z;
+    }
+  }
+
+  for (int l=0; l<cholmodSpace.SDP_nBlock; ++l) {
+    CholmodMatrix& cholmodMatrix = cholmodSpace.SDP_block[l];
+    int nDim = cholmodMatrix.nDim;
+    cholmod_sparse*  Z = cholmodMatrix.Z;
+    cholmod_sparse* dZ = cholmodMatrix.dZ;
+    DenseMatrix D_Z;
+    DenseMatrix D_ZdZ;
+    D_Z.initialize(nDim, nDim);
+    D_Z.setZero();
+    D_ZdZ.initialize(nDim, nDim);
+    D_ZdZ.setZero();
+
+    DenseMatrix D_Result;
+    D_Result.initialize(nDim, nDim);
+    D_Result.setZero();
+    
+    const int ncol = (int) Z->ncol;
+    for (int j=0; j < ncol; ++j) {
+      const int start_row = ((int*)Z->p)[j];
+      const int end_row   = ((int*)Z->p)[j+1];
+      for (int i_index = start_row; i_index < end_row; ++i_index) {
+	const int    i     = ((   int*)Z->i)[i_index];
+	const double value = ((double*)Z->x)[i_index];
+	const double dvalue = ((double*)dZ->x)[i_index];
+	D_Z.de_ele[i+j*nDim] = value;
+	D_Z.de_ele[j+i*nDim] = value;
+	D_ZdZ.de_ele[i+j*nDim] = value+dvalue;
+	D_ZdZ.de_ele[j+i*nDim] = value+dvalue;
+      }
+    }
+  }
+  rMessage("This routine is only check");
+  rMessage("This routine is not implemented fully");
+  rMessage("The computation of R is not implemented fully");
+  double norm_R = sqrt(sum);
+
+  rMessage("inv(X)*X = inv(Z)*Z = I?");
+
+  sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    const double yk = cholmodSpace.yVec.ele[k];
+    sum += yk*yk;
+  }
+  double norm_y = sqrt(sum);
+  
+  sum = 0.0;
+  for (int k=0; k<m; ++k) {
+    const double dyk = cholmodSpace.dyVec.ele[k];
+    sum += dyk*dyk;
+  }
+  double norm_dy = sqrt(sum);
+
+  printf("norm :: rp=%.2e, invX=%.2e, rD=%.2e, R=%.2e\n",
+	 norm_rp, norm_invX, norm_rD, norm_R);
+  // printf("min_X = %.2e, min_Z = %.2e\n", min_X, min_Z);
+  // printf("norm :: y=%.2e, dy=%.2e\n", norm_y, norm_dy);
+}
+
+  
+
+void Newton::display(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+
+  rMessage("This function is not implemented in SDPA-C");
+  #if 0
+  fprintf(fpout,"rNewton.DxMat = \n");
+  DxMat.display(fpout);
+  fprintf(fpout,"rNewton.DyVec = \n");
+  DyVec.display(fpout);
+  fprintf(fpout,"rNewton.DzMat = \n");
+  DzMat.display(fpout);
+  #endif
+}
+
+void Newton::display_index(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+  printf("display_index: %d %d\n",SDP_nBlock,LP_nBlock);
+
+  for (int l=0; l<SDP_nBlock; l++){
+    printf("SDP:%dth block\n",l);
+    for (int k=0; k<SDP_number[l]; k++){
+      printf("SDP(i=%d,ib=%d; j=%d,jb=%d) for target = %d\n",
+	     SDP_constraint1[l][k],SDP_blockIndex1[l][k],
+	     SDP_constraint2[l][k],SDP_blockIndex2[l][k], 
+	     SDP_location_sparse_bMat[l][k]);
+    }
+  }
+
+  for (int l=0; l<LP_nBlock; l++){
+    printf("LP:%dth block\n",l);
+    for (int k=0; k<LP_number[l]; k++){
+      printf("LP(i=%d,ib=%d; j=%d,jb=%d) for target = %d\n",
+	     LP_constraint1[l][k],LP_blockIndex1[l][k],
+	     LP_constraint2[l][k],LP_blockIndex2[l][k], 
+	     LP_location_sparse_bMat[l][k]);
+    }
+
+  }
+
+}
+
+void Newton::display_sparse_bMat(FILE* fpout)
+{
+  if (fpout == NULL) {
+    return;
+  }
+  fprintf(fpout,"{\n");
+  for (int index=0; index<sparse_bMat.NonZeroCount; ++index) {
+    int i        = sparse_bMat.row_index[index];
+    int j        = sparse_bMat.column_index[index];
+    double value = sparse_bMat.sp_ele[index];
+    fprintf(fpout,"val[%d,%d] = %e\n", i,j,value);
+  }
+  fprintf(fpout,"}\n");
+}
+
+} // end of namespace 'sdpa'
+