# A short tour of DataFrames: construction, indexing, CSV loading,
# subsetting and grouped counts. Lines starting with `#>` record the
# output each call is expected to print.
#
# NOTE(review): readtable, eltypes, by, iround and the df[:A] indexing form
# look like an older Julia/DataFrames API -- confirm the target version
# before running.
# TODO: needs more links to docs.
using DataFrames

# Show `x` and then terminate the line, so that consecutive results do not
# run together on the same output line.
function showln(x)
    show(x)
    println()
    return nothing
end

# A DataFrame is an in-memory database; every keyword argument below
# becomes one named column.
df = DataFrame(
    A = [1, 2],
    B = [e, pi],
    C = ["xx", "xy"]
)
showln(df)
#> 2x3 DataFrame
#> |-------|---|---------|------|
#> | Row # | A | B       | C    |
#> | 1     | 1 | 2.71828 | "xx" |
#> | 2     | 2 | 3.14159 | "xy" |

# A column can be selected either by its position or by its name
showln(df[1])
#> [1,2]
showln(df[:A])
#> [1,2]

showln(df[2])
#> [2.718281828459045,3.141592653589793]
showln(df[:B])
#> [2.718281828459045,3.141592653589793]

showln(df[3])
#> ASCIIString["xx","xy"]
showln(df[:C])
#> ASCIIString["xx","xy"]

# Rows, in contrast, can be selected only by number
showln(df[1, :])
#> 1x3 DataFrame
#> |-------|---|---------|------|
#> | Row # | A | B       | C    |
#> | 1     | 1 | 2.71828 | "xx" |
showln(df[1:2, :])
#> 2x3 DataFrame
#> |-------|---|---------|------|
#> | Row # | A | B       | C    |
#> | 1     | 1 | 2.71828 | "xx" |
#> | 2     | 2 | 3.14159 | "xy" |

# readtable() loads a DataFrame from a CSV file
iris = readtable("iris.csv")

# Inspect the column names and element types of the freshly loaded table
showln(names(iris))
#> [:SepalLength,:SepalWidth,:PetalLength,:PetalWidth,:Species]
showln(eltypes(iris))
#> Type[Float64,Float64,Float64,Float64,UTF8String]

# Keep only the rows that belong to a single species
showln(iris[iris[:Species] .== "setosa", :])
#> 50x5 DataFrame
#> |-------|-------------|------------|-------------|------------|----------|
#> | Row # | SepalLength | SepalWidth | PetalLength | PetalWidth | Species  |
#> | 1     | 5.1         | 3.5        | 1.4         | 0.2        | "setosa" |
#> | 2     | 4.9         | 3.0        | 1.4         | 0.2        | "setosa" |
#> | 3     | 4.7         | 3.2        | 1.3         | 0.2        | "setosa" |
#> | 4     | 4.6         | 3.1        | 1.5         | 0.2        | "setosa" |
#> | 5     | 5.0         | 3.6        | 1.4         | 0.2        | "setosa" |
#> | 6     | 5.4         | 3.9        | 1.7         | 0.4        | "setosa" |
#> | 7     | 4.6         | 3.4        | 1.4         | 0.3        | "setosa" |
#> | 8     | 5.0         | 3.4        | 1.5         | 0.2        | "setosa" |
#> | 9     | 4.4         | 2.9        | 1.4         | 0.2        | "setosa" |
#> ⋮
#> | 41    | 5.0         | 3.5        | 1.3         | 0.3        | "setosa" |
#> | 42    | 4.5         | 2.3        | 1.3         | 0.3        | "setosa" |
#> | 43    | 4.4         | 3.2        | 1.3         | 0.2        | "setosa" |
#> | 44    | 5.0         | 3.5        | 1.6         | 0.6        | "setosa" |
#> | 45    | 5.1         | 3.8        | 1.9         | 0.4        | "setosa" |
#> | 46    | 4.8         | 3.0        | 1.4         | 0.3        | "setosa" |
#> | 47    | 5.1         | 3.8        | 1.6         | 0.2        | "setosa" |
#> | 48    | 4.6         | 3.2        | 1.4         | 0.2        | "setosa" |
#> | 49    | 5.3         | 3.7        | 1.5         | 0.2        | "setosa" |
#> | 50    | 5.0         | 3.3        | 1.4         | 0.2        | "setosa" |

# Count how many rows each species has
showln(by(iris, :Species, group -> size(group, 1)))
#> 3x2 DataFrame
#> |-------|--------------|----|
#> | Row # | Species      | x1 |
#> | 1     | "setosa"     | 50 |
#> | 2     | "versicolor" | 50 |
#> | 3     | "virginica"  | 50 |

# Discretize whole columns at once by rounding them to integers
for col in [:SepalLength, :SepalWidth]
    iris[col] = iround(iris[col])
end

# Count rows per combination of species and discretized measurements
# to see "clusters"
tabulated = by(iris, [:Species, :SepalLength, :SepalWidth], group -> size(group, 1))
showln(tabulated)
#> 17x4 DataFrame
#> |-------|--------------|-------------|------------|----|
#> | Row # | Species      | SepalLength | SepalWidth | x1 |
#> | 1     | "setosa"     | 4           | 3          | 4  |
#> | 2     | "setosa"     | 5           | 2          | 1  |
#> | 3     | "setosa"     | 5           | 3          | 23 |
#> | 4     | "setosa"     | 5           | 4          | 17 |
#> | 5     | "setosa"     | 6           | 4          | 5  |
#> | 6     | "versicolor" | 5           | 2          | 3  |
#> | 7     | "versicolor" | 5           | 3          | 3  |
#> | 8     | "versicolor" | 6           | 2          | 6  |
#> | 9     | "versicolor" | 6           | 3          | 29 |
#> | 10    | "versicolor" | 7           | 3          | 9  |
#> | 11    | "virginica"  | 5           | 3          | 1  |
#> | 12    | "virginica"  | 6           | 2          | 1  |
#> | 13    | "virginica"  | 6           | 3          | 22 |
#> | 14    | "virginica"  | 7           | 3          | 19 |
#> | 15    | "virginica"  | 7           | 4          | 1  |
#> | 16    | "virginica"  | 8           | 3          | 4  |
#> | 17    | "virginica"  | 8           | 4          | 2  |