#!/usr/bin/perl

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

#
# MogileFS storage node daemon
#  (perlbal front-end)
#
# (c) 2004, Brad Fitzpatrick, <brad@danga.com>
# (c) 2006, Six Apart, Ltd.

use strict;
use lib 'lib';
use IO::Socket::INET;
use POSIX qw(ENOENT EACCES EBADF WNOHANG);
use Fcntl qw(SEEK_CUR SEEK_SET SEEK_END O_RDWR O_CREAT O_TRUNC);
use Perlbal 1.52;

# verify their Linux::AIO or IO::AIO works.  Perlbal 1.51 does this,
# but just copying it here so people don't need to upgrade for this
# one check.  also because the rules are different:  in Perlbal
# it's understandable to not have working in AIO, in mogstored
# it's essentially required, except for dev and light testing.
BEGIN {
    my $OPTMOD_IO_AIO        = eval "use IO::AIO 1.6 (); 1;";
    my $OPTMOD_LINUX_AIO     = eval "use Linux::AIO 1.71 (); 1;";
    if ($OPTMOD_LINUX_AIO) {
        my $good = 0;
        Linux::AIO::aio_open("/tmp/$$-" . rand() . "-bogusdir/bogusfile-$$", O_RDWR|O_CREAT|O_TRUNC, 0, sub {
            $good = 1 if $_[0] < 0 && $! == ENOENT;
        });
        while (Linux::AIO::nreqs()) {
            my $rfd = "";
            vec ($rfd, Linux::AIO::poll_fileno(), 1) = 1;
            select $rfd, undef, undef, undef;
            Linux::AIO::poll_cb();
        }
        unless ($good) {
            # pretend that they don't have Linux::AIO, but only bitch at them if they don't have IO::AIO ...
            if ($OPTMOD_IO_AIO) {
                $Perlbal::AIO_MODE = "ioaio";
            } else {
                warn("WARNING:  Your installation of Linux::AIO doesn't work.\n".
                     "          You seem to have installed it without 'make test',\n".
                     "          or you ignored the failing tests.  I'm going to ignore\n".
                     "          that you have it and proceed without async IO.  The\n".
                     "          modern replacement to Linux::AIO is IO::AIO.\n");
            }
            $OPTMOD_LINUX_AIO = 0;
        }
    }
    unless ($OPTMOD_LINUX_AIO || $OPTMOD_IO_AIO) {
        if ($ENV{'MOGSTORED_RUN_WITHOUT_AIO'}) {
            warn("WARNING:  Running without async IO.  Won't run well with many clients.\n");
        } else {
            die("ERROR: IO::AIO not installed, so async IO not available.  Refusing to run\n".
                "       unless you set the environment variable MOGSTORED_RUN_WITHOUT_AIO=1\n");
        }
    }

}

# State:
my %on_death;             # pid -> subref (to run when pid dies)
my %devnum_to_device;     # mogile device number (eg. 'dev1' would be '1') -> os device path (eg. '/dev/rd0')
my %osdevnum_to_device;   # os device number (fetched via stat(file)[0]) -> os device path (ec. '/dev/rd0')
my %iostat_listeners;     # fd => SideChannel client: clients interested in iostat data.
my $iostat_available = 1; # bool: iostat working.  assume working to start.
my ($iostat_pipe_r, $iostat_pipe_w);  # pipes for talking to iostat process

# Config:
my $opt_daemonize;
my $opt_config;
my $opt_iostat = 1;  # default to on now
my $max_conns = 10000;
my $http_listen = "0.0.0.0:7500";
my $mgmt_listen = "0.0.0.0:7501";
my $docroot     = "/var/mogdata";
my $default_config = "/etc/mogilefs/mogstored.conf";
my %config_opts = (
                   'iostat'       => \$opt_iostat,
                   'daemonize|d'  => \$opt_daemonize,
                   'config=s'     => \$opt_config,
                   'httplisten=s' => \$http_listen,
                   'mgmtlisten=s' => \$mgmt_listen,
                   'docroot=s'    => \$docroot,
                   'maxconns=i'    => \$max_conns,
                   );
usage() unless Getopt::Long::GetOptions(%config_opts);

$opt_config = $default_config if ! $opt_config && -e $default_config;
load_config_file($opt_config => \%config_opts) if $opt_config;

# use AIO channels in Perlbal
Perlbal::AIO::set_file_to_chan_hook(sub {
    my $filename = shift;
    $filename =~ m{/dev(\d+)\b} or return undef;
    return "dev$1";
});

# this is the perlbal configuration only.  not the mogstored configuration.
my $pb_conf = "
SERVER max_connections = $max_conns
CREATE SERVICE mogstored
   SET role = web_server
   SET listen = $http_listen
   SET docroot = $docroot
   SET dirindexing = 0
   SET enable_put = 1
   SET enable_delete = 1
   SET min_put_directory = 1
   SET persist_client = 1
ENABLE mogstored
";

Perlbal::run_manage_commands($pb_conf, sub { print STDERR "$_[0]\n"; });

unless (Perlbal::Socket->WatchedSockets > 0) {
    die "Invalid configuration.  (shouldn't happen?)  Stopping.\n";
}

if ($opt_daemonize) {
    die "mogstored won't daemonize with \$ENV{MOGSTORED_RUN_WITHOUT_AIO} set.\n" if $ENV{'MOGSTORED_RUN_WITHOUT_AIO'};
    Perlbal::daemonize();
} else {
    print "Running.\n";
}

# set number of AIO threads, between 10-100 (for some reason, have to
# set aio threads after daemonizing)
{
    my $aio_threads = aio_threads(disks($docroot));
    Perlbal::run_manage_commands("SERVER aio_threads = $aio_threads", sub { print STDERR "$_[0]\n"; });
}

# kill our children processes on exit:
my $parent_pid = $$;
$SIG{TERM} = sub {
    if ($$ == $parent_pid) {
        kill 9, grep { $_ } keys %on_death;
        exit(0);
    }
};

setup_iostat_pipes();
start_disk_usage_process();
start_iostat_process() if $opt_iostat;
harvest_dead_children();  # every 2 seconds, it reschedules itself
setup_sidechannel_listener();

# now start the main loop
Perlbal::run();


############################################################################
# main:: functions
############################################################################

sub usage {
    my $note = shift;
    $note = $note ? "NOTE: $note\n\n" : "";

    die "${note}Usage: mogstored [OPTS]

OPTS:
 --daemonize  -d        Daemonize
 --config=<file>        Set config file (default is /etc/mogilefs/mogstored.conf)
 --httplisten=<ip:port> IP/Port HTTP server listens on
 --mgmtlisten=<ip:port> IP/Port management/sidechannel listens on
 --docroot=<path>       Docroot above device mount points.  Defaults to /var/mogdata
";

}

sub load_config_file {
    my ($conffile, $opts) = @_;

    # parse the mogstored config file, which is just lines of comments and
    # "key = value" lines, where keys are just the same as commandline
    # options.
    die "Config file $opt_config doesn't exist.\n" unless -e $conffile;
    open my $fh, $conffile or die "Couldn't open config file for reading: $!";
    while (<$fh>) {
        s/\#.*//;
        next unless /\S/;
        if (/SERVER max_connect/i || /CREATE SERVICE/i) {
            usage("Your $opt_config file is the old syntax.  The new format is simply lines of <key> = <value> where keys are the same as mogstored's command line options.");

        }
        die "Unknown config syntax: $_\n" unless /^\s*(\w+)\s*=\s*(.+?)\s*$/;
        my ($key, $val) = ($1, $2);
        my $dest;
        foreach my $ck (keys %$opts) {
            next unless $ck =~ /^$key\b/;
            $dest = $opts->{$ck};
        }
        die "Unknown config setting: $key\n" unless $dest;
        $$dest = $val;
    }
}

sub harvest_dead_children {
    my $dead = waitpid(-1, WNOHANG);
    if ($dead > 0) {
        my $code = delete $on_death{$dead};
        $code->() if $code;
    }
    Danga::Socket->AddTimer(2, \&harvest_dead_children);
}

# returns $pid of child, if parent, else runs child.
sub start_disk_usage_process {
    my $child = fork;
    unless (defined $child) {
        Perlbal::log('crit', "Fork error creating disk usage tracking process");
        return undef;
    }

    # if we're the parent.
    if ($child) {
        $on_death{$child} = sub {
            start_disk_usage_process();  # start a new one
        };
        return $child;
    }

    # else, we're the child..
    $0 = "mogstored [diskusage]";
    while (1) {
        look_at_disk_usage();
        sleep 30;
    }
}

sub look_at_disk_usage {
    my $err = sub { Perlbal::log('crit', $_[0]); 1; };
    my $path = Perlbal->service('mogstored')->{docroot};
    $path =~ s!/$!!;

    # find all devices below us
    my @devnum;
    if (opendir(D, $path)) {
        @devnum = grep { /^dev\d+$/ } readdir(D);
        closedir(D);
    } else {
        return $err->("Failed to open $path: $!");
    }

    foreach my $devnum (@devnum) {
        my $rval = `df -P -l -k $path/$devnum`;
        my $uperK = ($rval =~ /512-blocks/i) ? 2 : 1; # units per kB
        foreach my $l (split /\r?\n/, $rval) {
            next unless $l =~ /^(.+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)\s+(.+)$/;
            my ($dev, $total, $used, $avail, $useper, $disk) = ($1, $2, $3, $4, $5, $6);

            unless ($disk =~ m!$devnum/?$!) {
                $disk = "$path/$devnum";
            }

            # create string to print
            my $now = time;
            my $output = {
                time      => time(),
                device    => $dev,    # /dev/sdh1
                total     => int($total / $uperK), # integer: total KiB blocks
                used      => int($used  / $uperK), # integer: used KiB blocks
                available => int($avail / $uperK),  # integer: available KiB blocks
                'use'     => $useper, # "45%"
                disk      => $disk,   # mount point of disk (/var/mogdata/dev8), or path if not a mount
            };

            # open a file on that disk location called 'usage'
            my $rv = open(FILE, ">$disk/usage");
            unless ($rv) {
                return $err->("Unable to open '$disk/usage' for writing: $!");
                next;
            }
            foreach (sort keys %$output) {
                print FILE "$_: $output->{$_}\n";
            }
            close FILE;
        }
    }
}

sub iostat_subscribe {
    my $sock = shift;
    $iostat_listeners{fileno($sock->sock)} = $sock;
}

sub iostat_unsubscribe {
    my $sock = shift;
    my $fdno = fileno($sock->sock);
    return unless defined $fdno;
    delete $iostat_listeners{$fdno};
}

sub setup_sidechannel_listener {
    # setup a new socket for handling size requests
    my $server = IO::Socket::INET->new(LocalAddr => $mgmt_listen,
                                       Type      => SOCK_STREAM,
                                       Proto     => 'tcp',
                                       Blocking  => 0,
                                       Reuse     => 1,
                                       Listen    => 10 )
        or die "Error creating management socket: $@\n";

    # in Perl 5.6, we weren't always seeing this turned off by IO::Socket
    # so we have to do it manually here just to be sure.
    IO::Handle::blocking($server, 0);

    # accept handler for new workers
    my $accept_handler = sub {
        my $csock = $server->accept or return;
        IO::Handle::blocking($csock, 0);
        my $client = SideChannelClient->new($csock);
        $client->watch_read(1);
    };

    # add to fd list so this one gets processed
    Perlbal::Socket->AddOtherFds(fileno($server) => $accept_handler);
}

my $iostat_read_buf = "";
sub setup_iostat_pipes {
    pipe ($iostat_pipe_r, $iostat_pipe_w);
    IO::Handle::blocking($iostat_pipe_r, 0);
    IO::Handle::blocking($iostat_pipe_w, 0);

    Danga::Socket->AddOtherFds(fileno($iostat_pipe_r), sub {
        read_from_iostat_child();
    });

}

sub start_iostat_process {
    my $pid = fork;
    unless (defined $pid) {
        warn "Fork for iostat failed: $!";
        return;
    }

    if ($pid) {
        # Parent
        $on_death{$pid} = sub {
            start_iostat_process();
        };
        return;
    }

    # (runs in child process)
    $0 = "mogstored [iostat]";

    close STDIN;
    close STDOUT;
    close STDERR;

    # We may not be able to see errors beyond this point
    open STDIN, '<', '/dev/null'       or die "Couldn't open STDIN for reading from /dev/null";
    open STDOUT, '>&', $iostat_pipe_w  or die "Couldn't dup pipe for use as STDOUT";
    open STDERR, '>', '/dev/null'      or die "Couldn't open STDOUT for writing to /dev/null";

    select((select(STDOUT), $|++)[0]);

    my $get_iostat_fh = sub {
        while (1) {
            if (open (my $fh, "iostat -dx 1 30|")) {
                return $fh;
            }
            # TODO: try and find other paths to iostat
            warn "Failed to open iostat: $!\n"; # this will just go to /dev/null, but will be straceable
            sleep 10;
        }
    };

    while (1) {
        my $iofh = $get_iostat_fh->();
        my $mog_sysid = mog_sysid_map();  # 5 (mogdevid) -> 2340 (os devid)
        my $dev_sysid = {};  # hashref, populated lazily:  { /dev/sdg => system dev_t }
        my %devt_util;  # dev_t => 52.55
        my $init = 0;
        while (<$iofh>) {
            if (m/^Device:/) {
                %devt_util = ();
                $init = 1;
                next;
            }
            next unless $init;
            if (m/^ (\S+) .*? ([\d.]+) \n/x) {
                my ($devnode, $util) = ("/dev/$1", $2);
                unless (exists $dev_sysid->{$devnode}) {
                    $dev_sysid->{$devnode} = (stat($devnode))[6]; # rdev
                }
                my $devt = $dev_sysid->{$devnode};
                $devt_util{$devt} = $util;
                next;
            }
            # blank line is the end.
            if (m!^\s*\n!) {
                $init = 0;
                my $ret = "";
                foreach my $mogdevid (sort { $a <=> $b } keys %$mog_sysid) {
                    my $devt = $mog_sysid->{$mogdevid};
                    my $ut = defined $devt_util{$devt} ? $devt_util{$devt} : "-";
                    $ret .= "$mogdevid\t$ut\n";
                }
                $ret .= ".\n";
                print $ret;
                next;
            }
        }
    }
}

# (runs in iostat child process)
#  returns hashref of { 5 => dev_t device }  # mog_devid -> os_devid
sub mog_sysid_map {
    my $path = Perlbal->service('mogstored')->{docroot};
    $path =~ s!/$!!;

    # find all devices below us
    my @devnum;  # integer ids
    opendir(my $d, $path) or die "Failed to open docroot: $path: $!";
    @devnum = map { /^dev(\d+)$/ ? $1 : () } readdir($d);

    my $map = {};
    foreach my $mogdevid (@devnum) {
        my ($osdevid) = (stat("$path/dev$mogdevid"))[0];
        $map->{$mogdevid} = $osdevid;
    }
    return $map;
}

# (runs in parent event-loop process)
sub read_from_iostat_child {
    my $data;
    my $rv = sysread($iostat_pipe_r, $data, 10240);
    return unless $rv && $rv > 0;

    $iostat_read_buf .= $data;

    # only write complete lines to sockets (in case for some reason we get
    # a partial read and child process dies...)
    while ($iostat_read_buf =~ s/(.+)\r?\n//) {
        my $line = $1;
        foreach my $out_sock (values %iostat_listeners) {
            # where $line will be like "dev53\t53.23" or a "." to signal end of a group of devices.
            $out_sock->write("$line\n");
        }
    }
}

sub disks {
    my $root = shift;
    opendir(my $dh, $root) or die "Failed to open docroot: $root: $!";
    return scalar grep { /^dev\d+$/ } readdir($dh);
}

# returns aio threads to use, given a disk count
sub aio_threads {
    my $disks = shift;
    my $threads = ($disks || 1) * 10;
    return 100 if $threads > 100;
    return $threads;
}

#############################################################################
### simple package for handling the stream request port
package SideChannelClient;

use strict;
use base qw{Danga::Socket};
use fields (
            'count',      # how many requests we've serviced
            'read_buf',   # unprocessed read buffer
            'mogsvc',     # the mogstored Perlbal::Service object
            );

# needed since we're pretending to be a Perlbal::Socket... never idle out
sub max_idle_time { return 0; }

sub new {
    my SideChannelClient $self = shift;
    $self = fields::new($self) unless ref $self;
    $self->SUPER::new(@_);
    $self->{count} = 0;
    $self->{read_buf} = '';
    $self->{mogsvc} = Perlbal->service('mogstored');
    return $self;
}

sub event_read {
    my SideChannelClient $self = shift;

    my $bref = $self->read(1024);
    return $self->close unless defined $bref;
    $self->{read_buf} .= $$bref;

    my $path = $self->{mogsvc}->{docroot};

    while ($self->{read_buf} =~ s/^(.+?)\r?\n//) {
        my $cmd = $1;
        if ($cmd =~ /^size (\S+)$/) {
            # increase our count
            $self->{count}++;

            # validate uri
            my $uri = $1;
            if ($uri =~ /\.\./) {
                $self->write("ERROR: uri invalid (contains ..)\r\n");
                return;
            }

            # now stat the file to get the size and such
            Perlbal::AIO::aio_stat("$path$uri", sub {
                return if $self->{closed};
                my $size = -e _ ? -s _ : -1;
                $self->write("$uri $size\r\n");
            });
        } elsif ($cmd =~ /^watch$/i) {
            unless ($iostat_available) {
                $self->write("ERR iostat unavailable\r\n");
                next;
            }
            $self->watch_read(0);
            main::iostat_subscribe($self);
        } else {
            # we don't understand this so pass it on to manage command interface
            my @out;
            Perlbal::run_manage_command($cmd, sub { push @out, $_[0]; });
            $self->write(join("\r\n", @out) . "\r\n");
        }
    }
}

# override Danga::Socket's event handlers which die
sub event_err { $_[0]->close; }
sub event_hup { $_[0]->close; }

# as_string handler
sub as_string {
    my SideChannelClient $self = shift;

    my $ret = $self->SUPER::as_string;
    $ret .= "; size_requests=$self->{count}";

    return $ret;
}

sub close {
    my SideChannelClient $self = shift;
    main::iostat_unsubscribe($self);
    $self->SUPER::close;
}

# Local Variables:
# mode: perl
# c-basic-indent: 4
# indent-tabs-mode: nil
# End:

__END__

=head1 NAME

mogstored -- MogileFS storage daemon

=head1 USAGE

This is the MogileFS storage daemon, which is just an HTTP server that
supports PUT, DELETE, etc.  It's actually a wrapper around L<Perlbal>,
doing all the proper Perlbal config for you.

In addition, it monitors disk usage, I/O activity, etc, which are
checked from the L<MogileFS tracker|mogilefsd>.

=head1 AUTHORS

Brad Fitzpatrick E<lt>brad@danga.comE<gt>

Mark Smith E<lt>junior@danga.comE<gt>

Jonathan Steinert E<lt>jsteinert@sixapart.comE<gt>

=head1 COPYRIGHT

 Copyright 2004, Danga Interactive
 Copyright 2005-2006, Six Apart Ltd.

=head1 LICENSE

Same terms as Perl itself.  Artistic/GPLv2, at your choosing.

=head1 SEE ALSO

L<MogileFS::Overview> -- high level overview of MogileFS

L<mogilefsd> -- MogileFS daemon

L<http://danga.com/mogilefs/>
