Julia is a high-level, high-performance dynamic programming language for technical computing. This site is an unofficial collection of Julia examples; for more details, see the about page.
This script prints a string identical to its own source code, see here.
github.com/karbarcca/Rosetta-Julia/blob/master/src/Quine.jl: In Julia, $x
in a string literal interpolates the value of the variable into the string. $(expression)
evaluates the expression and interpolates the result into the string. Normally, the string value "hi\tworld"
would be inserted without quotation marks and with a literal tab
The repr
function returns a string value that contains quotation marks and in which the literal tab is replaced by the characters \t
. When the result of the repr
function is interpolated, the result is what you would type into your code to create that string literal.
# A quine: running this prints its own two statements verbatim.
# `repr(x)` produces the quoted, escaped source form of the string `x`,
# so interpolating both `repr(x)` and `x` reconstructs the code exactly.
# (The code itself must stay byte-identical or the quine property breaks.)
x="println(\"x=\$(repr(x))\\n\$x\")" println("x=$(repr(x))\n$x")
# Plug a bubble-sort algorithm into Base's sorting framework.
# (Legacy pre-1.0 Julia: `immutable`, old Sort API.)
import Base.Sort

# Singleton algorithm type; pass the `BubbleSort` instance to `sort!`.
immutable BubbleSortAlg <: Sort.Algorithm end
const BubbleSort = BubbleSortAlg()

# In-place bubble sort of v[lo:hi] under ordering `o`.
# Repeats full passes until one pass performs no swaps (`clean` stays true).
function Base.sort!(v::AbstractVector, lo::Int, hi::Int, ::BubbleSortAlg, o::Sort.Ordering)
    while true
        clean = true
        for i = lo:hi-1
            # Sort.lt tests "less than" under the requested ordering `o`.
            if Sort.lt(o, v[i+1], v[i])
                v[i+1], v[i] = v[i], v[i+1]
                clean = false
            end
        end
        clean && break
    end
    return v
end
The following macro implements a simple C-style enum: `@enum T A B C` defines an immutable wrapper type `T` and one numbered constant per symbol.
# Minimal `@enum`-style macro (predates the builtin Base.@enum; legacy
# `immutable` syntax). `@enum T A B C` defines an immutable wrapper type `T`
# around an Int32 plus constants `A = T(0)`, `B = T(1)`, ...
macro enum(T,syms...)
    blk = quote
        immutable $(esc(T))
            n::Int32
            $(esc(T))(n::Integer) = new(n)
        end
        # Show a value by looking up its symbol (n is zero-based, hence +1).
        Base.show(io::IO, x::$(esc(T))) = print(io, $syms[x.n+1])
        # Show the type itself as e.g. "enum T (A, B, C)".
        Base.show(io::IO, x::Type{$(esc(T))}) = print(io, $(string("enum ", T, ' ', '(', join(syms, ", "), ')')))
    end
    # Append one `const` binding per enum member, numbered from 0.
    for (i,sym) in enumerate(syms)
        push!(blk.args, :(const $(esc(sym)) = $(esc(T))($(i-1))))
    end
    push!(blk.args, :nothing)   # make the whole block evaluate to `nothing`
    blk.head = :toplevel        # type + const definitions need top level
    return blk
end
module ModInts
importall Base

# Integers mod n, with the modulus n carried in the type parameter.
# (Legacy pre-1.0 syntax: `immutable`, `importall`, `-{n}(...)` methods.)
immutable ModInt{n} <: Integer
    k::Int
    # Canonicalize the representative into 0:n-1.
    ModInt(k) = new(mod(k,n))
end

# Arithmetic is only defined between values sharing the same modulus n;
# results are reduced mod n by the inner constructor.
-{n}(a::ModInt{n}) = ModInt{n}(-a.k)
+{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k+b.k)
-{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k-b.k)
*{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k*b.k)

# Conversion + promotion let plain Ints mix into ModInt expressions.
convert{n}(::Type{ModInt{n}}, i::Int) = ModInt{n}(i)
promote_rule{n}(::Type{ModInt{n}}, ::Type{Int}) = ModInt{n}

show{n}(io::IO, k::ModInt{n}) = print(io, "$(k.k) mod $n")
showcompact(io::IO, k::ModInt) = print(io, k.k)

end # module
Example of the 8 Queens Puzzle.
# Solver for the N-queens problem on an x-by-y board (8 Queens Puzzle).
# Queens are [col, row] pairs; a partial solution is an array of such pairs.
# Updated for Julia >= 1.0: `Array(T, 0)` -> `Vector{Int}[]`, vectorized
# `abs(a-b)` -> scalar arithmetic, `!= nothing` -> `!==`, and the
# allocation-free predicate form `any(f, xs)` instead of `any(map(f, xs))`.

# Return a new arrangement extending `queens` with `queen` (non-mutating).
addqueen(queens::Array{Vector{Int}}, queen::Vector{Int}) = push!(copy(queens), queen)

# Does `queen` attack any already-placed queen? (false for an empty board)
hitsany(queen::Vector{Int}, queens::Array{Vector{Int}}) = any(x -> hits(queen, x), queens)

# Two queens attack iff they share a column/row or sit on a common diagonal.
hits(a::Array{Int}, b::Array{Int}) = any(a .== b) || abs(a[1]-b[1]) == abs(a[2]-b[2])

# Depth-first search: place `n` more queens on the x-by-y board, extending
# the partial solution `d`. Returns a complete arrangement or `nothing`.
function solve(x, y, n, d=Vector{Int}[])
    if n == 0
        return d
    end
    for px = 1:x
        for py = 1:y
            if !hitsany([px, py], d)
                s = solve(x, y, n-1, addqueen(d, [px, py]))
                s !== nothing && return s
            end
        end
    end
    return nothing
end
importall Base
# figure 5.2 from principles of parallel programming, ported to julia.
# sum a vector using a tree on top of local reductions.
# (Legacy pre-1.0 Julia: `importall`, `{ }` comprehensions, RemoteRef, numel.)
function sum(v::DArray)
    P = procs(v)
    # One RemoteRef "mailbox" per participating process, indexed by rank+1.
    nodeval = { RemoteRef(p) for p=P }
    answer = RemoteRef()
    np = numel(P)
    for i=0:np-1
        @spawnat P[i+1] begin
            stride=1
            tally = sum(localpart(v))   # local reduction first
            # Binary-tree combine: at each round a rank divisible by
            # 2*stride takes its partner's tally; otherwise it deposits
            # its own tally and drops out of the tree.
            while stride < np
                if i%(2*stride) == 0
                    tally = tally + take(nodeval[i+stride])
                    stride = 2*stride
                else
                    put(nodeval[i], tally)
                    break
                end
            end
            if i==0
                put(answer, tally)      # rank 0 ends up with the total
            end
        end
    end
    fetch(answer)
end

# Flat (non-tree) reduction: reduce locally on every process, then fold
# the fetched partial results with `f`.
function reduce(f, v::DArray)
    mapreduce(fetch, f,
              { @spawnat p reduce(f,localpart(v)) for p = procs(v) })
end

# possibly-useful abstraction:
# A group of RemoteRefs indexable like an array; getindex fetches,
# setindex! puts.
type RefGroup
    refs::Array{RemoteRef,1}
    RefGroup(P) = new([ RemoteRef(p) for p=P ])
end
getindex(r::RefGroup, i) = fetch(r.refs[i])
setindex!(r::RefGroup, v, i) = put(r.refs[i], v)
module Time
export TimeDelta

import Base.show, Base.+, Base.-, Base.convert, Base.promote_rule

# A span of time in units of 10^p seconds, stored as an Int64 count `v`.
# (Legacy pre-1.0 syntax: `immutable`, `show{p}(...)` parametric methods.)
immutable TimeDelta{p}
    v::Int64
end

# SI prefixes from 10^-24 (yocto) to 10^24 (yotta); "" is the plain second.
const PREFIXES = [
    "yocto", "zepto", "atto", "femto", "pico", "nano", "micro", "milli",
    "", "kilo", "mega", "giga", "tera", "peta", "exa", "zetta", "yotta",
]
const ZERO_INDEX = 9    # index of "" (plain seconds) within PREFIXES
const MAX_INDEX = 17

function show{p}(io::IO, x::TimeDelta{p})
    # Pick the nearest SI prefix for exponent p, clamped to the table.
    k = max(1,min(MAX_INDEX,fld(p,3)+ZERO_INDEX))
    r = p-3(k-ZERO_INDEX)   # leftover power of ten the prefix doesn't cover
    prefix = PREFIXES[k]
    if r == 0
        # Exact prefix match: pluralize unless the count is exactly 1.
        s = x.v == 1 ? "" : "s"
        print(io, "$(x.v) $(prefix)second$s")
    elseif r > 0
        print(io, "$(x.v*10^r) $(prefix)seconds")
    else
        print(io, "$(x.v/10^-r) $(prefix)seconds")
    end
end

# Rescale the count between exponents: converting to a finer unit
# multiplies; converting to a coarser unit truncates with div.
convert{p,q}(::Type{TimeDelta{p}}, x::TimeDelta{q}) =
    TimeDelta{p}(p <= q ? x.v*10^(q-p) : div(x.v,10^(p-q)))

# Promote to the finer (smaller p) resolution so no precision is lost.
promote_rule{p,q}(::Type{TimeDelta{p}}, ::Type{TimeDelta{q}}) = TimeDelta{min(p,q)}

# Same-exponent arithmetic; mixed exponents go through promotion below.
-{p}(x::TimeDelta{p}) = TimeDelta{p}(-x.v)
+{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v+y.v)
-{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v-y.v)
+(x::TimeDelta, y::TimeDelta) = +(promote(x,y)...)
-(x::TimeDelta, y::TimeDelta) = -(promote(x,y)...)

end # module
# N-dimensional grid generation, analogous to MATLAB's ndgrid/meshgrid.
# (Legacy pre-1.0 syntax: `f{T}(...)`, `Array(T, dims)`, `ntuple(n, f)`.)

ndgrid(v::AbstractVector) = copy(v)

# Two-vector case: returns (X, Y) where X varies down rows, Y across columns.
function ndgrid{T}(v1::AbstractVector{T}, v2::AbstractVector{T})
    m, n = length(v1), length(v2)
    v1 = reshape(v1, m, 1)
    v2 = reshape(v2, 1, n)
    (repmat(v1, 1, n), repmat(v2, m, 1))
end

# Fill `a` linearly so the dimension with stride `s` cycles through `v`
# with period `snext`.
function ndgrid_fill(a, v, s, snext)
    for j = 1:length(a)
        a[j] = v[div(rem(j-1, snext), s)+1]
    end
end

# General n-vector case: one output array per input vector, each of size
# (length(v1), ..., length(vn)), with output i varying along dimension i.
function ndgrid{T}(vs::AbstractVector{T}...)
    n = length(vs)
    sz = map(length, vs)
    out = ntuple(n, i->Array(T, sz))
    s = 1
    for i=1:n
        a = out[i]::Array
        v = vs[i]
        snext = s*size(a,i)   # stride of the next dimension
        ndgrid_fill(a, v, s, snext)
        s = snext
    end
    out
end

meshgrid(v::AbstractVector) = meshgrid(v, v)

# meshgrid follows MATLAB's convention: X varies across columns, Y down rows
# (the transpose of ndgrid's layout).
function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T})
    m, n = length(vy), length(vx)
    vx = reshape(vx, 1, n)
    vy = reshape(vy, m, 1)
    (repmat(vx, m, 1), repmat(vy, 1, n))
end

function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T},
                     vz::AbstractVector{T})
    m, n, o = length(vy), length(vx), length(vz)
    vx = reshape(vx, 1, n, 1)
    vy = reshape(vy, m, 1, 1)
    vz = reshape(vz, 1, 1, o)
    # Index with all-ones vectors to broadcast each reshaped vector
    # along the other two dimensions.
    om = ones(Int, m)
    on = ones(Int, n)
    oo = ones(Int, o)
    (vx[om, :, oo], vy[:, on, oo], vz[om, on, :])
end
# Consistency tests for the LRUExample caches: checks move-to-front on
# insertion/lookup and tail eviction for the bounded cache.
# (Legacy pre-1.0 Julia: ASCIIString, Uint8, int(), logspace.)
using LRUExample

TestLRU = LRUExample.UnboundedLRU{ASCIIString, ASCIIString}()
TestBLRU = LRUExample.BoundedLRU{ASCIIString, ASCIIString}(1000)

# Deterministic key/value string derived from integer i (nibble expansion
# of its four little-endian bytes).
get_str(i) = ascii(vcat(map(x->[x>>4; x&0x0F], reinterpret(Uint8, [int32(i)]))...))

# A cache type is bounded iff it declares a `maxsize` field.
isbounded{L<:LRUExample.LRU}(::Type{L}) = any(map(n->n==:maxsize, L.names))
isbounded{L<:LRUExample.LRU}(l::L) = isbounded(L)

nmax = int(logspace(2, 5, 4))   # test sizes: 100, 1000, 10000, 100000

function lrutest()
    #println("LRU consistency tests")
    for lru in (TestLRU,TestBLRU)
        for n in nmax
            empty!(lru)
            #@printf("  %s, %d items\n", lru, n)
            #print("    Simple eviction: ")
            for i in 1:n
                str = get_str(i)
                lru[str] = str
                # Newest entry must be at the front of the queue.
                @assert lru.q[1].v == str
                if isbounded(lru) && length(lru) >= lru.maxsize
                    # Oldest surviving entry must be at the tail.
                    tailstr = get_str(i-lru.maxsize+1)
                    @assert lru.q[end].v == tailstr
                end
            end
            #println("pass")

            #print("    Lookup, random access: ")
            for i in 1:n
                str = get_str(rand(1:n))
                if haskey(lru, str) # the bounded LRUs can have cache misses
                    blah = lru[str]
                    # A successful lookup must move the entry to the front.
                    @assert lru.q[1].v == blah
                end
            end
            #println("pass")
        end
        empty!(lru)
    end
end

lrutest()
# Runtime method addition plus a hand-rolled "staged function" macro
# (predates the builtin @generated; legacy `{ }` syntax, apply, symbol()).

# Add to generic function `gf` a method with argument names `an`, argument
# types `at`, and body expression `body`, by eval'ing a synthesized
# definition.
function add_method(gf, an, at, body)
    argexs = { Expr(symbol("::"), an[i], at[i]) for i=1:length(an) }
    def = quote
        let __F__=($gf)
            function __F__($(argexs...))
                $body
            end
        end
    end
    eval(def)
end

# @staged f(args...): `f`'s body is run once per concrete argument-type
# tuple to *produce* an expression, which is installed as a specialized
# method and then called.
macro staged(fdef)
    if !isa(fdef,Expr) || !is(fdef.head,:function)
        error("@staged: expected method definition")
    end
    fname = fdef.args[1].args[1]
    argspec = fdef.args[1].args[2:end]
    # Strip `::T` annotations to recover the bare argument names.
    argnames = map(x->(isa(x,Expr) ? x.args[1] : x), argspec)
    qargnames = map(x->Expr(:quote,x), argnames)
    fbody = fdef.args[2]
    @gensym gengf argtypes expander genbody
    quote
        let ($gengf)
            global ($fname) # should be "outer"
            local ($expander)
            # `expander` runs the user body on the argument *types* to
            # produce the code for one specialization.
            function ($expander)($(argnames...))
                $fbody
            end
            ($gengf)() = 0 # should be initially empty GF
            # The visible function: generate & cache a method per type tuple,
            # then dispatch to it.
            function ($fname)($(argspec...))
                ($argtypes) = typeof(tuple($(argnames...)))
                if !method_exists($gengf, $argtypes)
                    ($genbody) = apply(($expander), ($argtypes))
                    add_method($gengf, {$(qargnames...)}, $argtypes, $genbody)
                end
                return ($gengf)($(argnames...))
            end
        end
    end
end

# example: build a nest of `for` loops, one level per element of `dims`,
# at specialization time.
@staged function nloops(dims::Tuple)
    names = map(x->gensym(), dims)
    ex = quote
        println([$(names...)])
    end
    for i = 1:length(dims)
        ex = quote
            for $(names[i]) in dims[$i]
                $ex
            end
        end
    end
    ex
end
# Parallel Conway's Game of Life on a DArray, displayed via Tk/Cairo.
# (Legacy pre-1.0 Julia: Array(Bool, ...), Uint32, randbool, DArray API.)

# Apply the Life rule to the interior of `old`, which carries a one-cell
# border; the result is two smaller in each dimension.
function life_rule(old)
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        for i = 2:m-1
            # Count the 8 neighbors of cell (i,j).
            nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                   old[i  ,j-1],             old[i  ,j+1],
                   old[i+1,j-1], old[i+1,j], old[i+1,j+1])
            # Birth on 3 neighbors, survival on 2, death otherwise.
            new[i-1,j-1] = (nc == 3 ? 1 : nc == 2 ? old[i,j] : 0)
        end
    end
    new
end

# One generation: each worker gathers its chunk plus a one-cell halo
# (with toroidal wrap-around) and applies life_rule locally.
function life_step(d)
    DArray(size(d),procs(d)) do I
        # fetch neighborhood - toroidal boundaries
        top   = mod(first(I[1])-2,size(d,1))+1
        bot   = mod( last(I[1])  ,size(d,1))+1
        left  = mod(first(I[2])-2,size(d,2))+1
        right = mod( last(I[2])  ,size(d,2))+1
        old = Array(Bool, length(I[1])+2, length(I[2])+2)
        # The nine halo/interior fetches are independent remote reads,
        # so issue them concurrently and wait for all of them.
        @sync begin
            @async old[1      , 1      ] = d[top , left]   # left side
            @async old[2:end-1, 1      ] = d[I[1], left]
            @async old[end    , 1      ] = d[bot , left]
            @async old[1      , 2:end-1] = d[top , I[2]]
            @async old[2:end-1, 2:end-1] = d[I[1], I[2]]   # middle
            @async old[end    , 2:end-1] = d[bot , I[2]]
            @async old[1      , end    ] = d[top , right]  # right side
            @async old[2:end-1, end    ] = d[I[1], right]
            @async old[end    , end    ] = d[bot , right]
        end
        life_rule(old)
    end
end

# Interactive driver: animate an m-by-n random board until the canvas is
# clicked; prints a frames-per-second figure roughly every two seconds.
function plife(m, n)
    w = Window("parallel life", n, m)
    c = Canvas(w)
    pack(c)
    done = false
    c.mouse.button1press = (c,x,y)->(done=true)
    cr = getgc(c)
    # Random initial generation, distributed over the workers.
    grid = DArray(I->convert(Array{Bool,2},randbool(map(length,I))), (m, n), workers())
    last = time(); f = 1
    while !done
        @async begin
            # Live cells render white (0x00ffffff), dead cells black.
            img = convert(Array{Uint32,2},grid) .* 0x00ffffff
            set_source_surface(cr, CairoRGBSurface(img), 0, 0)
            paint(cr)
            reveal(c)
        end
        t = time()
        if t-last > 2
            println("$(f/(t-last)) FPS")
            last = t; f = 0
        end
        grid = life_step(grid)
        f += 1
        sleep(0.01)
    end
end
module Quaternions

import Base: convert, promote_rule, show, real, imag, conj, abs, abs2, inv, +, -, /, *

# Hamilton quaternion q0 + q1*i + q2*j + q3*k over any Real component type.
# (Legacy pre-1.0 syntax: `immutable`, `convert{T}(...)`, MathConst.)
immutable Quaternion{T<:Real} <: Number
    q0::T
    q1::T
    q2::T
    q3::T
end

# Outer constructor: promote mixed component types to a common T.
Quaternion(q0::Real,q1::Real,q2::Real,q3::Real) = Quaternion(promote(q0,q1,q2,q3)...)

# Reals and complexes embed with the remaining components zero.
convert{T}(::Type{Quaternion{T}}, x::Real) =
    Quaternion(convert(T,x), zero(T), zero(T), zero(T))
convert{T}(::Type{Quaternion{T}}, z::Complex) =
    Quaternion(convert(T,real(z)), convert(T,imag(z)), zero(T), zero(T))
convert{T}(::Type{Quaternion{T}}, z::Quaternion) =
    Quaternion(convert(T,z.q0), convert(T,z.q1), convert(T,z.q2), convert(T,z.q3))

# Anything narrower than a quaternion promotes up to Quaternion.
promote_rule{s,S}(::Type{MathConst{s}}, ::Type{Quaternion{S}}) = Quaternion{S}
promote_rule{T,S}(::Type{Complex{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}
promote_rule{S}(::Type{Bool}, ::Type{Quaternion{S}}) = Quaternion{S}
promote_rule{T<:Real,S}(::Type{T}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}
promote_rule{T,S}(::Type{Quaternion{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}

# Print as "q0 + q1i + q2j + q3k", folding signs into the separators.
function show(io::IO, z::Quaternion)
    pm(x) = x < 0 ? " - $(-x)" : " + $x"
    print(io, z.q0, pm(z.q1), "i", pm(z.q2), "j", pm(z.q3), "k")
end

real(z::Quaternion) = z.q0
imag(z::Quaternion) = z.q1

conj(z::Quaternion) = Quaternion(z.q0, -z.q1, -z.q2, -z.q3)
abs(z::Quaternion) = sqrt(z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3)
abs2(z::Quaternion) = z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3
# Multiplicative inverse: conjugate over squared norm.
inv(z::Quaternion) = conj(z)/abs2(z)

(-)(z::Quaternion) = Quaternion(-z.q0, -z.q1, -z.q2, -z.q3)
(/)(z::Quaternion, x::Real) = Quaternion(z.q0/x, z.q1/x, z.q2/x, z.q3/x)
(+)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 + w.q0, z.q1 + w.q1,
                                               z.q2 + w.q2, z.q3 + w.q3)
(-)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 - w.q0, z.q1 - w.q1,
                                               z.q2 - w.q2, z.q3 - w.q3)
# Hamilton product (non-commutative).
(*)(z::Quaternion, w::Quaternion) = Quaternion(z.q0*w.q0 - z.q1*w.q1 - z.q2*w.q2 - z.q3*w.q3,
                                               z.q0*w.q1 + z.q1*w.q0 + z.q2*w.q3 - z.q3*w.q2,
                                               z.q0*w.q2 - z.q1*w.q3 + z.q2*w.q0 + z.q3*w.q1,
                                               z.q0*w.q3 + z.q1*w.q2 - z.q2*w.q1 + z.q3*w.q0)
(/)(z::Quaternion, w::Quaternion) = z*inv(w)

end # module
# wordcount.jl
#
# Implementation of parallelized "word-count" of a text, inspired by the
# Hadoop WordCount example. Uses @spawn and fetch() to parallelize
# the "map" task. Reduce is currently done single-threaded.
# (Legacy pre-1.0 Julia: `{}` literals, int(), readall, require.)
#
# To run in parallel on a string stored in variable `text`:
#  julia -p <N>
#  julia> require("<julia_dir>/examples/wordcount.jl")
#  julia> ...(define text)...
#  julia> counts=parallel_wordcount(text)
#
# Or to run on a group of files, writing results to an output file:
#  julia -p <N>
#  julia> require("<julia_dir>/examples/wordcount.jl")
#  julia> wordcount_files("/tmp/output.txt", "/tmp/input1.txt","/tmp/input2.txt",...)

# "Map" function.
# Takes a string. Returns a Dict with the number of times each word
# appears in that string.
function wordcount(text)
    # Split on whitespace and common punctuation; `false` drops empty tokens.
    words=split(text,[' ','\n','\t','-','.',',',':',';'],false)
    counts=Dict()
    for w = words
        counts[w]=get(counts,w,0)+1
    end
    return counts
end

# "Reduce" function.
# Takes a collection of Dicts in the format returned by wordcount()
# Returns a Dict in which words that appear in multiple inputs
# have their totals added together.
function wcreduce(wcs)
    counts=Dict()
    for c in wcs, (k,v) in c
        counts[k] = get(counts,k,0)+v
    end
    return counts
end

# Splits input string into nprocs() equal-sized chunks (last one rounds up),
# and @spawns wordcount() for each chunk to run in parallel. Then fetch()s
# results and performs wcreduce().
function parallel_wordcount(text)
    lines=split(text,'\n',false)
    np=nprocs()
    unitsize=ceil(length(lines)/np)
    wcounts={}   # legacy `{}` == untyped (Any) array literal
    rrefs={}
    # spawn procs: one chunk of lines per process
    for i=1:np
        first=unitsize*(i-1)+1
        last=unitsize*i
        if last>length(lines)
            last=length(lines)
        end
        subtext=join(lines[int(first):int(last)],"\n")
        push!(rrefs, @spawn wordcount( subtext ) )
    end
    # fetch results as each spawned task completes
    while length(rrefs)>0
        push!(wcounts,fetch(pop!(rrefs)))
    end
    # reduce the per-chunk counts into one Dict
    count=wcreduce(wcounts)
    return count
end

# Takes the name of a result file, and a list of input file names.
# Combines the contents of all files, then performs a parallel_wordcount
# on the resulting string. Writes the results to result_file.
function wordcount_files(result_file,inputs...)
    text = ""
    for file in inputs
        open(file) do f
            text *= readall(f)
        end
    end
    wc = parallel_wordcount(text)
    open(result_file,"w") do f
        for (k,v) in wc
            println(f, k,"=",v)
        end
    end
end
module LRUExample
# An LRU (Least Recently Used) cache is an associative data structure which
# maintains its contents in an order such that the most recently used item
# is at the beginning of the structure, and the least recently used at the end.
#
# This file specifies two types of LRU caches, both with and without a size
# limit. BoundedLRU has a limit and evicts the LRU item if a new item is added
# after that bound is reached. UnboundedLRU does not have a maximum size, but
# can be used as a basis for more complex LRUs.
#
# LRUs should follow the interfaces for general collections, indexable
# collections, and associative collections.
#
# The standard implementation of an LRU backs a hash table with a doubly-linked
# list for O(1) operations when reordering on access and eviction. The Julia
# implementation instead backs the table with a Vector. For moderately-sized
# collections, the difference in performance is small, and this implmentation
# is simpler and easier to understand.
#
# (Legacy pre-1.0 Julia: `abstract`/`type` keywords, Associative,
# tuple-typed `invoke`, unshift!.)

import Base.isempty, Base.length, Base.sizeof
import Base.start, Base.next, Base.done
import Base.haskey, Base.get
import Base.setindex!, Base.getindex, Base.delete!, Base.empty!
import Base.show

abstract LRU{K,V} <: Associative{K,V}

# Default cache size
const __MAXCACHE = 1024

# One table entry; mutable so setindex! can update `v` in place.
type CacheItem{K,V}
    k::K
    v::V
end

type UnboundedLRU{K,V} <: LRU{K,V}
    ht::Dict
    # q[1] is most recently used; q[end] is the eviction candidate.
    q::Vector{CacheItem}
    UnboundedLRU() = new(Dict(), similar(Array(CacheItem,1), 0))
end
UnboundedLRU() = UnboundedLRU{Any, Any}()

type BoundedLRU{K,V} <: LRU{K,V}
    ht::Dict
    q::Vector{CacheItem}
    maxsize::Int
    BoundedLRU(m) = new(Dict(), similar(Array(CacheItem,1), 0), m)
    BoundedLRU() = BoundedLRU(__MAXCACHE)
end
BoundedLRU(m) = BoundedLRU{Any, Any}(m)
BoundedLRU() = BoundedLRU{Any, Any}()

## collections ##
isempty(lru::LRU) = isempty(lru.q)
length(lru::LRU) = length(lru.q)
haskey(lru::LRU, key) = haskey(lru.ht, key)

## associative ##
# Should this check count as an access?
# NOTE(review): this is an exact duplicate of the haskey method above;
# kept byte-identical to the original source.
haskey(lru::LRU, key) = haskey(lru.ht, key)
get(lru::LRU, key, default) = haskey(lru, key) ? lru[key] : default
function empty!(lru::LRU)
    empty!(lru.ht)
    empty!(lru.q)
end

show(io::IO, lru::UnboundedLRU) = print(io,"UnboundedLRU()")
show(io::IO, lru::BoundedLRU) = print(io,"BoundedLRU($(lru.maxsize))")

## indexable ##
# Method to do the second, slow lookup in the list with early return.
function locate(q, x)
    for i = 1:length(q)
        if q[i] == x
            return i
        end
    end
    error("Item not found.")
end

# Lookup also *touches* the entry: it is moved to the front of the queue.
function getindex(lru::LRU, key)
    item = lru.ht[key]
    idx = locate(lru.q, item)
    splice!(lru.q, idx)
    unshift!(lru.q, item)
    item.v
end

# Insert or update; either way the entry ends up at the front of the queue.
function setindex!(lru::LRU, v, key)
    if haskey(lru, key)
        item = lru.ht[key]
        idx = locate(lru.q, item)
        item.v = v
        splice!(lru.q, idx)
    else
        item = CacheItem(key, v)
        lru.ht[key] = item
    end
    unshift!(lru.q, item)
end

# Eviction
# Bounded variant: delegate to the generic setindex!, then pop least
# recently used items from the tail until within maxsize.
function setindex!{V,K}(lru::BoundedLRU, v::V, key::K)
    invoke(setindex!, (LRU, V, K), lru, v, key)
    nrm = length(lru) - lru.maxsize
    for i in 1:nrm
        rm = pop!(lru.q)
        delete!(lru.ht, rm.k)
    end
end

## associative ##
function delete!(lru::LRU, key)
    item = lru.ht[key]
    idx = locate(lru.q, item)
    delete!(lru.ht, key)
    delete!(lru.q, idx)
end

end # module
module TypeTrees
##
# Generate a text graphic of Julia modules type tree
# (Legacy pre-1.0 Julia: UnionType, TypeConstructor, `super`, `type`.)
##

# The node type holds the type of the cuurent node and a dict of subtypes
type TTNode
    strname::String
    typ::Type
    subtypes::Dict{String, TTNode}
    TTNode(sname::String, t::Type) = new(sname, t, Dict{String, TTNode}())
end

# Add a node to a dict if not added
function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, tnode::TTNode)
    ret = get(subtypes, sname, nothing)
    (nothing == ret) && (ret = subtypes[sname] = tnode)
    ret
end

function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, t::Type)
    ret = get(subtypes, sname, nothing)
    (nothing == ret) && (subtypes[sname] = ret = TTNode(sname, t))
    ret
end

# Get a string name for the type
typ_name(t::UnionType) = string(t)
typ_name(t::TypeConstructor) = string(t)
typ_name(t) = string(t.name)

# Store a type and its type hierarchy chain
# Recurse till we reach the top level type
function store_type(sname::String, t::UnionType)
    suptype = UnionType
    tnode = TTNode(sname, t)
    # store unions under UnionType type
    subtypes = store_type(typ_name(suptype), suptype)
    add_ttnode(subtypes, sname, tnode)
    # unions are also in a sense related to the types of their components
    for suptype = t.types
        subtypes = store_type(typ_name(suptype), suptype)
        add_ttnode(subtypes, sname, tnode)
    end
    return tnode.subtypes
end

function store_type(sname::String, t::TypeConstructor)
    suptype = t.body
    subtypes = store_type(typ_name(suptype), suptype)
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t::DataType)
    suptype = super(t)
    # recurse up to the root; `types_tree` is the forest of roots
    subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t::Tuple)
    tnode = add_ttnode(types_tree, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t)
    suptype = super(t)
    subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

# examine all symbols in module and store those that are types
function store_all_from(m::Module)
    for expr = names(m,true)
        try
            t = eval(m,expr)
            isa(t, Type) && store_type(string(expr), t)
        #catch ex
        #    println("Error adding ", string(expr), m, " (", ex, ")")
        end
    end
end

type_props(typ) = ""
type_props(typ::DataType) = string("<<",
                                   typ.abstract ? " abstract" : " concrete",
                                   typ.mutable ? " mutable" : " immutable",
                                   typ.pointerfree ? " pointerfree" : "",
                                   " size:", typ.size,
                                   " >>")

# Recursively print the tree, two-space indenting each nesting level.
function print_tree(subtypes::Dict{String, TTNode}, pfx::String="")
    for n in sort!([keys(subtypes)...])
        v = subtypes[n]
        if(n == string(v.typ))
            println(pfx, "+- ", n, " ", type_props(v.typ))
        else
            # alias: show both the bound name and the underlying type
            println(pfx, "+- ", n, " = ", v.typ, " ", type_props(v.typ))
        end
        print_tree(v.subtypes, pfx * ". ")
    end
end

# TODO: optionally take module names in command line
# TODO: sort output
# TODO: option to list subtrees of type tree, or other symbol types
const types_tree = Dict{String, TTNode}()

for m in (Base, Core, Main)
    store_all_from(m)
end

# print_tree(types_tree)

end # module
## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization"
## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7
## (Legacy pre-1.0 Julia: linspace, cell, sub, lufact!/lu!, RemoteRef,
##  remotecall, DArray `distribute`, blas_gemm.)

# Sequential blocked LU solve of A x = b; reference version of the
# parallel codes below.
function hpl_seq(A::Matrix, b::Vector)
    blocksize = 5
    n = size(A,1)
    A = [A b]
    # Block boundaries: panel i covers rows/cols B_rows[i]+1 : B_rows[i+1].
    B_rows = linspace(0, n, div(n,blocksize)+1)
    B_rows[end] = n
    B_cols = [B_rows, [n+1]]
    nB = length(B_rows)
    depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures
    ## Small matrix case
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end
    ## Add a ghost row of dependencies to boostrap the computation
    for j=1:nB; depend[1,j] = true; end
    for i=1:(nB-1)
        ## Threads for panel factorizations
        I = (B_rows[i]+1):B_rows[i+1]
        #(depend[i+1,i], panel_p) = spawn(panel_factor_seq, I, depend[i,i])
        (depend[i+1,i], panel_p) = panel_factor_seq(A, I, depend[i,i])
        ## Threads for trailing updates
        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]
            #depend[i+1,j] = spawn(trailing_update_seq, I, J, panel_p, depend[i+1,i],depend[i,j])
            depend[i+1,j] = trailing_update_seq(A, I, J, panel_p, depend[i+1,i],depend[i,j])
        end
    end
    ## Completion of the last diagonal block signals termination
    #wait(depend[nB, nB])
    ## Solve the triangular system
    x = triu(A[1:n,1:n]) \ A[:,n+1]
    return x
end ## hpl()

### Panel factorization ###
# LU-factorize panel columns I (rows I[1]:n) in place; returns the
# completion flag and the row permutation mapped back to global indices.
function panel_factor_seq(A, I, col_dep)
    n = size (A, 1)
    ## Enforce dependencies
    #wait(col_dep)
    ## Factorize a panel
    K = I[1]:n
    panel_p = lufact!(sub(A, K, I))[:p] # Economy mode
    ## Panel permutation
    panel_p = K[panel_p]
    return (true, panel_p)
end ## panel_factor_seq()

### Trailing update ###
# Apply the panel's pivots to block column J, solve for the U block,
# and rank-update the trailing submatrix.
function trailing_update_seq(A, I, J, panel_p, row_dep, col_dep)
    n = size (A, 1)
    ## Enforce dependencies
    #wait(row_dep, col_dep)
    ## Apply permutation from pivoting
    K = (I[end]+1):n
    A[I[1]:n, J] = A[panel_p, J]
    ## Compute blocks of U
    L = tril(A[I,I],-1) + eye(length(I))
    A[I, J] = L \ A[I, J]
    ## Trailing submatrix update
    if !isempty(K)
        A[K,J] = A[K,J] - A[K,I]*A[I,J]
    end
    return true
end ## trailing_update_seq()

# This version is written for a shared memory implementation.
# The matrix A is local to the first Worker, which allocates work to other Workers
# All updates to A are carried out by the first Worker. Thus A is not distributed

hpl_par(A::Matrix, b::Vector) = hpl_par(A, b, max(1, div(max(size(A)),4)), true)
hpl_par(A::Matrix, b::Vector, bsize::Integer) = hpl_par(A, b, bsize, true)

# Blocked LU solve where trailing updates may run as @spawn'd tasks;
# `run_parallel=false` gives a serial execution of the same schedule.
function hpl_par(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool)
    n = size(A,1)
    A = [A b]
    if blocksize < 1
        throw(ArgumentError("hpl_par: invalid blocksize: $blocksize < 1"))
    end
    B_rows = linspace(0, n, div(n,blocksize)+1)
    B_rows[end] = n
    B_cols = [B_rows, [n+1]]
    nB = length(B_rows)
    # depend[i,j] holds either a Bool or (when parallel) a spawned future.
    depend = cell(nB, nB)
    ## Small matrix case
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end
    ## Add a ghost row of dependencies to boostrap the computation
    for j=1:nB; depend[1,j] = true; end
    for i=2:nB, j=1:nB; depend[i,j] = false; end
    for i=1:(nB-1)
        #println("A=$A") #####
        ## Threads for panel factorizations
        I = (B_rows[i]+1):B_rows[i+1]
        K = I[1]:n
        (A_KI, panel_p) = panel_factor_par(A[K,I], depend[i,i])
        ## Write the factorized panel back to A
        A[K,I] = A_KI
        ## Panel permutation
        panel_p = K[panel_p]
        depend[i+1,i] = true
        ## Apply permutation from pivoting
        J = (B_cols[i+1]+1):B_cols[nB+1]
        A[K, J] = A[panel_p, J]
        ## Threads for trailing updates
        #L_II = tril(A[I,I], -1) + eye(length(I))
        L_II = tril(sub(A,I,I), -1) + eye(length(I))
        K = (I[length(I)]+1):n
        A_KI = A[K,I]
        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]
            ## Do the trailing update (Compute U, and DGEMM - all flops are here)
            if run_parallel
                A_IJ = A[I,J]
                #A_KI = A[K,I]
                A_KJ = A[K,J]
                depend[i+1,j] = @spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j])
            else
                depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j])
            end
        end
        # Wait for all trailing updates to complete, and write back to A
        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]
            if run_parallel
                (A_IJ, A_KJ) = fetch(depend[i+1,j])
            else
                (A_IJ, A_KJ) = depend[i+1,j]
            end
            A[I,J] = A_IJ
            A[K,J] = A_KJ
            depend[i+1,j] = true
        end
    end
    ## Completion of the last diagonal block signals termination
    @assert depend[nB, nB]
    ## Solve the triangular system
    x = triu(A[1:n,1:n]) \ A[:,n+1]
    return x
end ## hpl()

### Panel factorization ###
function panel_factor_par(A_KI, col_dep)
    @assert col_dep
    ## Factorize a panel
    panel_p = lufact!(A_KI)[:p] # Economy mode
    return (A_KI, panel_p)
end ## panel_factor_par()

### Trailing update ###
# Solve the U block and apply the GEMM update; returns the updated blocks
# so the caller (first worker) can write them back into A.
function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep)
    @assert row_dep
    @assert col_dep
    ## Compute blocks of U
    A_IJ = L_II \ A_IJ
    ## Trailing submatrix update - All flops are here
    if !isempty(A_KJ)
        m, k = size(A_KI)
        n = size(A_IJ,2)
        # In-place DGEMM: A_KJ <- A_KJ - A_KI * A_IJ
        blas_gemm('N','N',m,n,k,-1.0,A_KI,m,A_IJ,k,1.0,A_KJ,m)
        #A_KJ = A_KJ - A_KI*A_IJ
    end
    return (A_IJ, A_KJ)
end ## trailing_update_par()

### using DArrays ###
# Distributed-array version: A is distributed by block columns; each
# owner factorizes/updates its own columns via remotecall.
function hpl_par2(A::Matrix, b::Vector)
    n = size(A,1)
    A = [A b]
    C = distribute(A, 2)
    nB = length(C.pmap)
    ## case if only one processor
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end
    depend = Array(RemoteRef, nB, nB)
    #pmap[i] is where block i's stuff is
    #block i is dist[i] to dist[i+1]-1
    for i = 1:nB
        #println("C=$(convert(Array, C))") #####
        ##panel factorization
        panel_p = remotecall_fetch(C.pmap[i], panel_factor_par2, C, i, n)
        ## Apply permutation from pivoting
        for j = (i+1):nB
            depend[i,j] = remotecall(C.pmap[j], permute, C, i, j, panel_p, n, false)
        end
        ## Special case for last column
        if i == nB
            depend[nB,nB] = remotecall(C.pmap[nB], permute, C, i, nB+1, panel_p, n, true)
        end
        ##Trailing updates
        (i == nB) ? (I = (C.dist[i]):n) :
                    (I = (C.dist[i]):(C.dist[i+1]-1))
        C_II = C[I,I]
        L_II = tril(C_II, -1) + eye(length(I))
        K = (I[length(I)]+1):n
        if length(K) > 0
            C_KI = C[K,I]
        else
            C_KI = zeros(0)
        end
        for j=(i+1):nB
            dep = depend[i,j]
            depend[j,i] = remotecall(C.pmap[j], trailing_update_par2, C, L_II, C_KI, i, j, n, false, dep)
        end
        ## Special case for last column
        if i == nB
            dep = depend[nB,nB]
            remotecall_fetch(C.pmap[nB], trailing_update_par2, C, L_II, C_KI, i, nB+1, n, true, dep)
        else
            #enforce dependencies for nonspecial case
            for j=(i+1):nB
                wait(depend[j,i])
            end
        end
    end
    A = convert(Array, C)
    x = triu(A[1:n,1:n]) \ A[:,n+1]
end ## hpl_par2()

function panel_factor_par2(C, i, n)
    (C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
                           (I = (C.dist[i]):(C.dist[i+1]-1))
    K = I[1]:n
    C_KI = C[K,I]
    #(C_KI, panel_p) = lu!(C_KI) #economy mode
    panel_p = lu!(C_KI)[2]
    C[K,I] = C_KI
    return panel_p
end ##panel_factor_par2()

# Apply the row permutation `panel_p` to block column j (or, when `flag`,
# to the right-hand-side column n+1).
function permute(C, i, j, panel_p, n, flag)
    if flag
        K = (C.dist[i]):n
        J = (n+1):(n+1)
        C_KJ = C[K,J]
        C_KJ = C_KJ[panel_p,:]
        C[K,J] = C_KJ
    else
        K = (C.dist[i]):n
        J = (C.dist[j]):(C.dist[j+1]-1)
        C_KJ = C[K,J]
        C_KJ = C_KJ[panel_p,:]
        C[K,J] = C_KJ
    end
end ##permute()

# Trailing update for block column j (or the RHS column when `flag`);
# waits on the permutation `dep` first.
function trailing_update_par2(C, L_II, C_KI, i, j, n, flag, dep)
    if isa(dep, RemoteRef); wait(dep); end
    if flag
        #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
        #                       (I = (C.dist[i]):(C.dist[i+1]-1))
        I = C.dist[i]:n
        J = (n+1):(n+1)
        K = (I[length(I)]+1):n
        C_IJ = C[I,J]
        if length(K) > 0
            C_KJ = C[K,J]
        else
            C_KJ = zeros(0)
        end
        ## Compute blocks of U
        C_IJ = L_II \ C_IJ
        C[I,J] = C_IJ
    else
        #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
        #                       (I = (C.dist[i]):(C.dist[i+1]-1))
        I = (C.dist[i]):(C.dist[i+1]-1)
        J = (C.dist[j]):(C.dist[j+1]-1)
        K = (I[length(I)]+1):n
        C_IJ = C[I,J]
        if length(K) > 0
            C_KJ = C[K,J]
        else
            C_KJ = zeros(0)
        end
        ## Compute blocks of U
        C_IJ = L_II \ C_IJ
        C[I,J] = C_IJ
        ## Trailing submatrix update - All flops are here
        if !isempty(C_KJ)
            cm, ck = size(C_KI)
            cn = size(C_IJ,2)
            # In-place DGEMM: C_KJ <- C_KJ - C_KI * C_IJ
            blas_gemm('N','N',cm,cn,ck,-1.0,C_KI,cm,C_IJ,ck,1.0,C_KJ,cm)
            #C_KJ = C_KJ - C_KI*C_IJ
            C[K,J] = C_KJ
        end
    end
end ## trailing_update_par2()

## Test n*n matrix on np processors
## Prints 5 numbers that should be close to zero
function test(n, np)
    A = rand(n,n); b = rand(n);
    X = (@elapsed x = A \ b);
    Y = (@elapsed y = hpl_par(A,b, max(1,div(n,np))));
    Z = (@elapsed z = hpl_par2(A,b));
    for i=1:(min(5,n))
        print(z[i]-y[i], " ")
    end
    println()
    return (X,Y,Z)   # elapsed times: builtin \, hpl_par, hpl_par2
end

## test k times and collect average
function test(n,np,k)
    sum1 = 0; sum2 = 0; sum3 = 0;
    for i = 1:k
        (X,Y,Z) = test(n,np)
        sum1 += X
        sum2 += Y
        sum3 += Z
    end
    return (sum1/k, sum2/k, sum3/k)
end