Skip to content

Commit

Permalink
Update some docs and benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
davidanthoff committed Aug 30, 2018
1 parent f60b257 commit e6ab23d
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 80 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ site: https://discourse.julialang.org/c/domain/data. I use the GitHub
issue tracker for bug reports and feature requests only.

By contributing code to Query.jl, you are agreeing to release it under
the [MIT License](https://github.com/davidanthoff/Query.jl/blob/master/LICENSE.md).
the [MIT License](https://github.com/queryverse/Query.jl/blob/master/LICENSE.md).
60 changes: 30 additions & 30 deletions benchmark/Rdatatable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,52 +132,52 @@ function benches(df::DataFrame)

ti[:sum1] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum2] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum3] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum4] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum_mean1] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:sum_mean2] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
return ti
Expand All @@ -189,52 +189,52 @@ function benches(df::DataTable)

ti[:sum1] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum2] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum3] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum4] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum_mean1] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:sum_mean2] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
return ti
Expand All @@ -246,52 +246,52 @@ function benches(df::IndexedTable)

ti[:sum1] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum2] = @elapsed @from i in df begin
@group i by i.id1 into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum3] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum4] = @elapsed @from i in df begin
@group i by (i.id1,i.id2) into g
@select {r=sum(g..v1)}
@select {r=sum(g.v1)}
@collect DataFrame
end
ti[:sum_mean1] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:sum_mean2] = @elapsed @from i in df begin
@group i by i.id3 into g
@select {s=sum(g..v1),m=mean(g..v3)}
@select {s=sum(g.v1),m=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
@group i by i.id4 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
@group i by i.id6 into g
@select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
@select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
@collect DataFrame
end
return ti
Expand Down
4 changes: 2 additions & 2 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ using DataTables

@bench "two columns" @from i in $dt begin
@group {i.A, i.B} by i.B into g
@select {m = mean(g..A)}
@select {m = mean(g.A)}
@collect
end

@bench "three columns" @from i in $dt begin
@group {i.A, i.B, i.C} by i.B into g
@select {m = mean(g..A)}
@select {m = mean(g.A)}
@collect
end
end
Expand Down
29 changes: 2 additions & 27 deletions docs/src/experimental.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ df = DataFrame(a=[1,1,2,3], b=[4,5,6,8])

df2 = df |>
@groupby(_.a) |>
@map({a=_.key, b=mean(_..b)}) |>
@map({a=key(_), b=mean(_.b)}) |>
@filter(_.b > 5) |>
@orderby_descending(_.b) |>
DataFrame
```

This example makes use of three experimental features: 1) the standalone
query commands, 2) the `..` syntax and 3) the `_` anonymous function syntax.
query commands, 2) the `.` syntax for extracting a column from a group and 3) the `_` anonymous function syntax.

## Standalone query operators

Expand Down Expand Up @@ -137,31 +137,6 @@ The `@take` command has the form `@take(source, n)`. `source` can be any source

The `@drop` command has the form `@drop(source, n)`. `source` can be any source that can be queried. `n` must be an integer, and it specifies how many elements from the beginning of the source should be dropped from the results.

## The `..` syntax

The syntax `a..b` is translated into `map(i->i.b, a)` in any query
expression. This is especially helpful when computing some reduction of
a given column of a grouped table.

For example, the following command groups a table by column `a`, and then
computes the mean of the `b` column for each group:

```julia
using DataFrames, Query

df = DataFrame(a=[1,1,2,3], b=[4,5,6,8])

@from i in df begin
@group i by i.a into g
@select {a=i.key, b=mean(g..b)}
@collect DataFrame
end
```

The `@group` command here creates a list of tables, i.e. `g` will hold
a full table for each group. The syntax `g..b` then extracts a single
column from that table.

## The `_` and `__` syntax

This syntax only works in the standalone query commands. Instead of writing
Expand Down
2 changes: 1 addition & 1 deletion docs/src/gettingstarted.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ The Query package does not require data sources or sinks to have a table like st
## Missing values

Missing values are represented as `DataValue` types from the
[DataValues.jl](https://github.com/davidanthoff/DataValues.jl) package.
[DataValues.jl](https://github.com/queryverse/DataValues.jl) package.
Here are some usage tips.

All arithmetic operators work automatically with missing values.
Expand Down
17 changes: 1 addition & 16 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,7 @@

## Overview

Query is a package for querying julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/davidanthoff/IterableTables.jl). One can for example query any of the following data sources:
any array,
[DataFrames](https://github.com/JuliaStats/DataFrames.jl),
[DataStreams](https://github.com/JuliaData/DataStreams.jl)
(including [CSV](https://github.com/JuliaData/CSV.jl),
[Feather](https://github.com/JuliaStats/Feather.jl),
[SQLite](https://github.com/JuliaDB/SQLite.jl),
[ODBC](https://github.com/JuliaDB/ODBC.jl)),
[DataTables](https://github.com/JuliaData/DataTables.jl),
[IndexedTables](https://github.com/JuliaComputing/IndexedTables.jl),
[TimeSeries](https://github.com/JuliaStats/TimeSeries.jl),
[Temporal](https://github.com/dysonance/Temporal.jl),
[TypedTables](https://github.com/FugroRoames/TypedTables.jl) and
[DifferentialEquations](https://github.com/JuliaDiffEq/DifferentialEquations.jl) (any `DESolution`).

The package currently provides working implementations for in-memory data sources, but will eventually be able to translate queries into e.g. SQL. There is a prototype implementation of such a "query provider" for [SQLite](https://github.com/JuliaDB/SQLite.jl) in the package, but it is experimental at this point and only works for a *very* small subset of queries.
Query is a package for querying Julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/queryverse/IterableTables.jl).

Query is heavily inspired by [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx). In fact, right now the package is largely an implementation of the [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) part of the [C# specification](https://msdn.microsoft.com/en-us/library/ms228593.aspx). Future versions of Query will most likely add features that are not found in the original [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) design.

Expand Down
6 changes: 3 additions & 3 deletions docs/src/querycommands.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,
x = @from i in df begin
@group i by i.children into g
@select {Key=g.key,Count=length(g)}
@select {Key=key(g),Count=length(g)}
@collect DataFrame
end
Expand All @@ -285,7 +285,7 @@ println(x)

## Split-Apply-Combine (a.k.a. `dplyr`)

`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous to *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise.
`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g.var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise.

#### Example

Expand All @@ -298,7 +298,7 @@ df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]),
x = @from i in df begin
@group i by i.state into g
@select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
@select {group=key(g),mage=mean(g.age), oldest=maximum(g.age), youngest=minimum(g.age)}
@collect DataFrame
end
Expand Down

0 comments on commit e6ab23d

Please sign in to comment.