#!/usr/bin/perl -w
###########################################################################
#
# Simple group-by implementation -- assumes already in sorted order.
#
# Copyright 2010,2011 Philip Johnson.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License at for
# more details.
#
###########################################################################
use strict;
use Getopt::Long;
my (@by, @actions, $precision, $delim, $noWarn);
$noWarn = 0;
$delim = '(?: |\t)+';
$precision = 4; #default to 4
if (!GetOptions("delimiter=s" => \$delim,
"precision=i" => \$precision,
"noWarn" => \$noWarn,
"by=s" => sub { Parse(\@by, undef, $_[1]) },
"full" => sub { @by = (0) },
"identity=s" => sub { Parse(\@actions, \&identity, $_[1]) },
"debugdump" => sub {push @actions, [\&debugdump, 0] },
"num" => sub { push @actions, [sub {return scalar(@{$_[2]})}, 0] },
"avg|mean=s" => sub { Parse(\@actions, \&avg, $_[1]) },
"var=s" => sub { Parse(\@actions, \&var, $_[1]) },
"sd=s" => sub { Parse(\@actions, sub {return sqrt(var(@_))}, $_[1])},
"sem=s" => sub { Parse(\@actions,sub {
return(sqrt(var(@_)/scalar(@{$_[2]})))}, $_[1])},
"q1=s" => sub { Parse(\@actions, \&percentile, $_[1], 25) },
"q2|median=s" => sub { Parse(\@actions, \&percentile, $_[1], 50) },
"q3=s" => sub { Parse(\@actions, \&percentile, $_[1], 75) },
"percent|pct=s" => sub {
die "Option -percent needs ':'\n" if $_[1] !~ /\d+:\d+/;
Parse(\@actions, \&percentile, split(/:/,$_[1]))},
"min=i" => sub { push @actions, [\&percentile, $_[1], 0] },
"max=i" => sub { push @actions, [\&percentile, $_[1], 100] },
"wheremax=i" => sub { push @actions, [\&wheremax, $_[1]] },
"sum=i" => sub { push @actions, [\&sum, $_[1]] },
"mse=s" => sub { die "Option -mse needs ':'\n" if $_[1] !~ /\d+:.+/;
Parse(\@actions, \&mse, split(/:/,$_[1])) },
"rmse=s" => sub { die "Option -mse needs ':'\n" if $_[1] !~ /\d+:.+/;
Parse(\@actions, sub {return sqrt(mse(@_))},
split(/:/,$_[1])) },
"bias=s" => sub{ die "Option -bias needs ':'\n" if $_[1] !~/\d+:.+/;
push @actions, [\&bias, split(/:/,$_[1]) ] },
"cdf=s" => sub { die "Option -cdf needs ':'\n" if $_[1] !~/\d+:.+/;
push @actions, [\&cdf, split(/:/,$_[1]) ] }
) || @actions == 0) {
die "Syntax: $0 -by [-delimiter 'regex'] [-precision ] *
Where is any of the following:
\t-identity --> contents of first row of column
\t-num --> number of rows
\t-avg|mean --> average
\t-var --> variance (divided by n, not n-1)
\t-sd --> standard deviation
\t-sem --> standard error of the mean (sd\/sqrt(n))
\t-q1 --> first quartile
\t-q2|median --> median
\t-q3 --> third quartile
\t-percent : --> percentile
\t-min --> minimum value
\t-max --> maximum value
\t-wheremax --> line where col has maximum value
\t-sum --> sum of values
\t-mse : --> mean squared error relative to value of formula
\t-bias : --> mean - (value of formula)
and :=
:= | [,]+ | -
:= |
:= \$
\n";
}
@by = map {--$_} @by; #switch from 1-based to 0-based
if (@by == 1 && $by[0] == -1) {
@by = ();
}
{ # main program block
$_ = <> || exit 0;
chomp;
s/^\s+//; #kill leading whitespace
my $F = [split /$delim/];
my @data = ($F);
my $currSet = join(chr(0), @{$F}[@by]); #assumes NUL isn't used in fields!
my $numFields = @$F;
while (<>) {
chomp;
s/^\s+//; #kill leading whitespace
$F = [split /$delim/];
if (@$F != $numFields && !$noWarn) {
print STDERR "Warning: line $. has ".scalar(@$F).
" fields instead of the expected $numFields.\n";
}
my $key = join(chr(0), @{$F}[@by]); #assumes NUL isn't used in fields!
if ($key ne $currSet) {
Report(\@actions, \@data);
@data = ();
$currSet = $key;
$numFields = @$F;
}
push @data, $F;
}
Report(\@actions, \@data);
}
#-----------------------------------------------------------------------------
# PRE : array of actions, function, command line argument w/ column
# numbers for parsing. If undefined function, then just push column
# number.
# POST: columns parsed out with any of the following syntax:
# X [just column X]
# X,Y,Z [cols X, Y and Z]
# X-Z [cols X through Z]
# X..Z [cols X through Z]
sub Parse {
my ($array, $func, $clArg, $fArg) = @_;
foreach my $val (split(/,/, $clArg)) {
if ($val =~ /^(\d+)(?:-|\.{2})(\d+)$/) {
my ($start, $stop) = split /-|(?:\.\.)/, $val;
for (my $i = $start; $i <= $stop; ++$i) {
push @$array, (defined $func ? [$func, $i, $fArg] : $i);
}
} elsif ($val =~ /^\d+$/) {
push @$array, (defined $func ? [$func, $val, $fArg] : $val);
} else {
die "Invalid column specification: '$clArg'\n\n";
}
}
}
#-----------------------------------------------------------------------------
sub Report {
my ($actions, $data) = @_;
my $i = 0;
foreach my $act (@$actions) {
print "\t" if ++$i > 1;
#-1 below is to switch from 1-based (from user) to 0-based
# if ($act->[0] == \&identity ||
# $act->[0] == \&wheremax) {
# print &{$act->[0]}($act->[1]-1, $act->[2], $data);
# } else {
# print sprintf('%.'.$precision.'g', &{$act->[0]}($act->[1]-1, $act->[2], $data));
# }
my $res = &{$act->[0]}($act->[1]-1, $act->[2], $data);
if ($res =~ /^\d*\.\d*$/) {
print sprintf('%.'.$precision.'g', $res);
} else {
print $res;
}
}
print "\n";
}
#-----------------------------------------------------------------------------
sub identity ($$$) {
my ($col, $extra, $data) = @_;
return $data->[0][$col];
}
#-----------------------------------------------------------------------------
sub debugdump ($$$) {
my ($col, $extra, $data) = @_;
print "-----------\n";
for (my $i = 0; $i < @$data; ++$i) {
print join("\t", @{$data->[$i]}), "\n";
}
print "-----------\n";
return '-';
}
#-----------------------------------------------------------------------------
sub sum ($$$) {
my ($col, $extra, $data) = @_;
my $sum = 0;
for (my $i = 0; $i < @$data; ++$i) {
$sum += $data->[$i][$col];
}
return $sum;
}
#-----------------------------------------------------------------------------
sub avg ($$$) {
my ($col, $extra, $data) = @_;
my $sum = 0;
for (my $i = 0; $i < @$data; ++$i) {
$sum += $data->[$i][$col];
}
return $sum / @$data;
}
#-----------------------------------------------------------------------------
sub var ($$$) {
my ($col, $extra, $data) = @_;
my $sum = 0;
my $sumSqrs = 0;
for (my $i = 0; $i < @$data; ++$i) {
$sum += $data->[$i][$col];
$sumSqrs += $data->[$i][$col] * $data->[$i][$col];
}
my $v = $sumSqrs/@$data - ($sum/@$data)**2;
return ($v > 0 ? $v : 0);#correct for numerical errors near 0
}
#-----------------------------------------------------------------------------
sub percentile ($$$) {
my ($col, $percentile, $data) = @_;
my @sortDat = sort {$a->[$col] <=> $b->[$col]} @$data;
if ($percentile == 100) { #special case maximum
return $sortDat[@$data-1][$col];
} else {
return $sortDat[@$data * $percentile/100][$col];
}
}
#-----------------------------------------------------------------------------
sub wheremax ($$) {
my ($col, $extra, $data) = @_;
my @sortDat = sort {$a->[$col] <=> $b->[$col]} @$data;
return join("\t", @{$sortDat[@$data-1]});
}
#-----------------------------------------------------------------------------
sub x_TokenizeFormula($) {
my ($formula) = @_;
my @f = split /(\$)/, $formula;
shift @f if ($f[0] eq '');
pop @f if ($f[$#f] eq '');
return @f;
}
sub x_TranslateFormula($$) {
my ($f, $row) = @_;
if ($f->[0] eq '$') {
return $row->[$f->[1]-1];
} else {
return $f->[0];
}
}
#-----------------------------------------------------------------------------
sub mse ($$$) {
my ($col, $formula, $data) = @_;
my @f = x_TokenizeFormula($formula);
my $sum = 0;
for (my $i = 0; $i < @$data; ++$i) {
$sum += ($data->[$i][$col] - x_TranslateFormula(\@f, $data->[$i]))**2;
}
return $sum/@$data;
}
#-----------------------------------------------------------------------------
sub bias ($$$) {
my ($col, $formula, $data) = @_;
my @f = x_TokenizeFormula($formula);
return &avg(@_) - x_TranslateFormula(\@f, $data->[0]);
}
#-----------------------------------------------------------------------------
sub cdf ($$$) {
my ($col, $formula, $data) = @_;
my @f = x_TokenizeFormula($formula);
my $x = x_TranslateFormula(\@f, $data->[0]);
my $cnt = 0;
for (my $i = 0; $i < @$data; ++$i) {
# some versions of perl have a lousy concept of infinity
if ($data->[$i][$col] <= $x && $data->[$i][$col] ne 'inf') {
++$cnt;
}
}
return $cnt / @$data;
}