## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization" ## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7 function hpl_seq(A::Matrix, b::Vector) blocksize = 5 n = size(A,1) A = [A b] B_rows = linspace(0, n, div(n,blocksize)+1) B_rows[end] = n B_cols = [B_rows, [n+1]] nB = length(B_rows) depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures ## Small matrix case if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end ## Add a ghost row of dependencies to boostrap the computation for j=1:nB; depend[1,j] = true; end for i=1:(nB-1) ## Threads for panel factorizations I = (B_rows[i]+1):B_rows[i+1] #(depend[i+1,i], panel_p) = spawn(panel_factor_seq, I, depend[i,i]) (depend[i+1,i], panel_p) = panel_factor_seq(A, I, depend[i,i]) ## Threads for trailing updates for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] #depend[i+1,j] = spawn(trailing_update_seq, I, J, panel_p, depend[i+1,i],depend[i,j]) depend[i+1,j] = trailing_update_seq(A, I, J, panel_p, depend[i+1,i],depend[i,j]) end end ## Completion of the last diagonal block signals termination #wait(depend[nB, nB]) ## Solve the triangular system x = triu(A[1:n,1:n]) \ A[:,n+1] return x end ## hpl() ### Panel factorization ### function panel_factor_seq(A, I, col_dep) n = size (A, 1) ## Enforce dependencies #wait(col_dep) ## Factorize a panel K = I[1]:n panel_p = lufact!(sub(A, K, I))[:p] # Economy mode ## Panel permutation panel_p = K[panel_p] return (true, panel_p) end ## panel_factor_seq() ### Trailing update ### function trailing_update_seq(A, I, J, panel_p, row_dep, col_dep) n = size (A, 1) ## Enforce dependencies #wait(row_dep, col_dep) ## Apply permutation from pivoting K = (I[end]+1):n A[I[1]:n, J] = A[panel_p, J] ## Compute blocks of U L = tril(A[I,I],-1) + eye(length(I)) A[I, J] = L \ A[I, J] ## Trailing submatrix update if !isempty(K) A[K,J] = A[K,J] - A[K,I]*A[I,J] end return true end ## trailing_update_seq() # This version is written for a shared memory implementation. # The matrix A is local to the first Worker, which allocates work to other Workers # All updates to A are carried out by the first Worker. Thus A is not distributed hpl_par(A::Matrix, b::Vector) = hpl_par(A, b, max(1, div(max(size(A)),4)), true) hpl_par(A::Matrix, b::Vector, bsize::Integer) = hpl_par(A, b, bsize, true) function hpl_par(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool) n = size(A,1) A = [A b] if blocksize < 1 throw(ArgumentError("hpl_par: invalid blocksize: $blocksize < 1")) end B_rows = linspace(0, n, div(n,blocksize)+1) B_rows[end] = n B_cols = [B_rows, [n+1]] nB = length(B_rows) depend = cell(nB, nB) ## Small matrix case if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end ## Add a ghost row of dependencies to boostrap the computation for j=1:nB; depend[1,j] = true; end for i=2:nB, j=1:nB; depend[i,j] = false; end for i=1:(nB-1) #println("A=$A") ##### ## Threads for panel factorizations I = (B_rows[i]+1):B_rows[i+1] K = I[1]:n (A_KI, panel_p) = panel_factor_par(A[K,I], depend[i,i]) ## Write the factorized panel back to A A[K,I] = A_KI ## Panel permutation panel_p = K[panel_p] depend[i+1,i] = true ## Apply permutation from pivoting J = (B_cols[i+1]+1):B_cols[nB+1] A[K, J] = A[panel_p, J] ## Threads for trailing updates #L_II = tril(A[I,I], -1) + eye(length(I)) L_II = tril(sub(A,I,I), -1) + eye(length(I)) K = (I[length(I)]+1):n A_KI = A[K,I] for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] ## Do the trailing update (Compute U, and DGEMM - all flops are here) if run_parallel A_IJ = A[I,J] #A_KI = A[K,I] A_KJ = A[K,J] depend[i+1,j] = @spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j]) else depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j]) end end # Wait for all trailing updates to complete, and write back to A for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] if run_parallel (A_IJ, A_KJ) = fetch(depend[i+1,j]) else (A_IJ, A_KJ) = depend[i+1,j] end A[I,J] = A_IJ A[K,J] = A_KJ depend[i+1,j] = true end end ## Completion of the last diagonal block signals termination @assert depend[nB, nB] ## Solve the triangular system x = triu(A[1:n,1:n]) \ A[:,n+1] return x end ## hpl() ### Panel factorization ### function panel_factor_par(A_KI, col_dep) @assert col_dep ## Factorize a panel panel_p = lufact!(A_KI)[:p] # Economy mode return (A_KI, panel_p) end ## panel_factor_par() ### Trailing update ### function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep) @assert row_dep @assert col_dep ## Compute blocks of U A_IJ = L_II \ A_IJ ## Trailing submatrix update - All flops are here if !isempty(A_KJ) m, k = size(A_KI) n = size(A_IJ,2) blas_gemm('N','N',m,n,k,-1.0,A_KI,m,A_IJ,k,1.0,A_KJ,m) #A_KJ = A_KJ - A_KI*A_IJ end return (A_IJ, A_KJ) end ## trailing_update_par() ### using DArrays ### function hpl_par2(A::Matrix, b::Vector) n = size(A,1) A = [A b] C = distribute(A, 2) nB = length(C.pmap) ## case if only one processor if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end depend = Array(RemoteRef, nB, nB) #pmap[i] is where block i's stuff is #block i is dist[i] to dist[i+1]-1 for i = 1:nB #println("C=$(convert(Array, C))") ##### ##panel factorization panel_p = remotecall_fetch(C.pmap[i], panel_factor_par2, C, i, n) ## Apply permutation from pivoting for j = (i+1):nB depend[i,j] = remotecall(C.pmap[j], permute, C, i, j, panel_p, n, false) end ## Special case for last column if i == nB depend[nB,nB] = remotecall(C.pmap[nB], permute, C, i, nB+1, panel_p, n, true) end ##Trailing updates (i == nB) ? (I = (C.dist[i]):n) : (I = (C.dist[i]):(C.dist[i+1]-1)) C_II = C[I,I] L_II = tril(C_II, -1) + eye(length(I)) K = (I[length(I)]+1):n if length(K) > 0 C_KI = C[K,I] else C_KI = zeros(0) end for j=(i+1):nB dep = depend[i,j] depend[j,i] = remotecall(C.pmap[j], trailing_update_par2, C, L_II, C_KI, i, j, n, false, dep) end ## Special case for last column if i == nB dep = depend[nB,nB] remotecall_fetch(C.pmap[nB], trailing_update_par2, C, L_II, C_KI, i, nB+1, n, true, dep) else #enforce dependencies for nonspecial case for j=(i+1):nB wait(depend[j,i]) end end end A = convert(Array, C) x = triu(A[1:n,1:n]) \ A[:,n+1] end ## hpl_par2() function panel_factor_par2(C, i, n) (C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : (I = (C.dist[i]):(C.dist[i+1]-1)) K = I[1]:n C_KI = C[K,I] #(C_KI, panel_p) = lu!(C_KI) #economy mode panel_p = lu!(C_KI)[2] C[K,I] = C_KI return panel_p end ##panel_factor_par2() function permute(C, i, j, panel_p, n, flag) if flag K = (C.dist[i]):n J = (n+1):(n+1) C_KJ = C[K,J] C_KJ = C_KJ[panel_p,:] C[K,J] = C_KJ else K = (C.dist[i]):n J = (C.dist[j]):(C.dist[j+1]-1) C_KJ = C[K,J] C_KJ = C_KJ[panel_p,:] C[K,J] = C_KJ end end ##permute() function trailing_update_par2(C, L_II, C_KI, i, j, n, flag, dep) if isa(dep, RemoteRef); wait(dep); end if flag #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : # (I = (C.dist[i]):(C.dist[i+1]-1)) I = C.dist[i]:n J = (n+1):(n+1) K = (I[length(I)]+1):n C_IJ = C[I,J] if length(K) > 0 C_KJ = C[K,J] else C_KJ = zeros(0) end ## Compute blocks of U C_IJ = L_II \ C_IJ C[I,J] = C_IJ else #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : # (I = (C.dist[i]):(C.dist[i+1]-1)) I = (C.dist[i]):(C.dist[i+1]-1) J = (C.dist[j]):(C.dist[j+1]-1) K = (I[length(I)]+1):n C_IJ = C[I,J] if length(K) > 0 C_KJ = C[K,J] else C_KJ = zeros(0) end ## Compute blocks of U C_IJ = L_II \ C_IJ C[I,J] = C_IJ ## Trailing submatrix update - All flops are here if !isempty(C_KJ) cm, ck = size(C_KI) cn = size(C_IJ,2) blas_gemm('N','N',cm,cn,ck,-1.0,C_KI,cm,C_IJ,ck,1.0,C_KJ,cm) #C_KJ = C_KJ - C_KI*C_IJ C[K,J] = C_KJ end end end ## trailing_update_par2() ## Test n*n matrix on np processors ## Prints 5 numbers that should be close to zero function test(n, np) A = rand(n,n); b = rand(n); X = (@elapsed x = A \ b); Y = (@elapsed y = hpl_par(A,b, max(1,div(n,np)))); Z = (@elapsed z = hpl_par2(A,b)); for i=1:(min(5,n)) print(z[i]-y[i], " ") end println() return (X,Y,Z) end ## test k times and collect average function test(n,np,k) sum1 = 0; sum2 = 0; sum3 = 0; for i = 1:k (X,Y,Z) = test(n,np) sum1 += X sum2 += Y sum3 += Z end return (sum1/k, sum2/k, sum3/k) end