/train.pl
#!/usr/bin/env perl
use v5.10;
use strict;
use warnings;

use DBI;

# Unbuffer STDOUT so the "\r" progress lines show up as they are printed.
$| = 1;

# Open the SQLite database file in the current directory.  Nothing below can
# work without a handle, so stop here with DBI's own error text instead of
# failing later on an undefined $dbh.
my $dbh = DBI->connect("dbi:SQLite:dbname=markov.db", "", "")
    or die "Cannot open markov.db: $DBI::errstr\n";

# Prepared statements used by the subs in this file.
# token text -> id
my $token_id_h = $dbh->prepare('SELECT id FROM tokens WHERE token = ?');
# id -> token text
my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
# register a new token (a conflicting insert is ignored rather than an error)
my $insert_h = $dbh->prepare('INSERT OR IGNORE INTO tokens (token) VALUES (?)');
# record one (t1, t2) -> next transition
my $map_h = $dbh->prepare('INSERT INTO seq2 (t1, t2, next) VALUES (?, ?, ?)');

# Return the id of $token in the `tokens` table, inserting the token first
# when it is not there yet.
#
#   $token - token string to look up / register
#
# Returns the id, or undef if neither the lookup nor the insert produced one.
sub insert_token {
    my $token = shift;

    # Per-process memo: a corpus repeats the same tokens over and over, and
    # nothing in this script deletes or renumbers rows in `tokens`, so a
    # token that was resolved once does not need another SELECT.
    state %id_for;
    return $id_for{$token} if defined $id_for{$token};

    # selectrow_array executes the prepared handle, fetches a single row and
    # finishes the handle, so no cursor stays open between calls.
    my ($tid) = $dbh->selectrow_array($token_id_h, undef, $token);

    if (!defined $tid) {
        $insert_h->execute($token);
        # The database-handle form with explicit arguments works on every
        # DBI release; the argument-less statement-handle form only exists
        # in newer ones.
        # NOTE(review): assumes tokens.id is the table's rowid -- confirm
        # against the schema.
        $tid = $dbh->last_insert_id(undef, undef, 'tokens', 'id');
    }

    return $id_for{$token} = $tid;
}

# Token ids of everything read so far, in reading order: read_corpus appends
# to it and map_corpus walks it to build the seq2 rows.
my @tokens;

# Tokenize one text file and append the resulting token ids to @tokens.
#
#   $filename - path of the text file to read (decoded as UTF-8)
#
# Every whitespace-preceded chunk is split into leading symbols, a word
# (word characters and apostrophes) and trailing symbols.  Leading symbols
# become one ">$sym" token each, the word becomes "|$word", trailing symbols
# become one "<$sym" token each; each token is registered via insert_token.
# A byte-progress line is printed to STDOUT every 101 input lines.
#
# Dies if the file cannot be opened.
sub read_corpus {
    my $filename = shift;
    say "reading $filename";

    # Lexical handle, checked open, and the validating UTF-8 layer (plain
    # ":utf8" accepts malformed input without complaint).
    open my $fh, '<:encoding(UTF-8)', $filename
        or die "Cannot open $filename: $!\n";
    my $total = -s $fh;
    my $lines_since_report = 0;

    # NOTE(review): the pattern requires whitespace in front of every chunk,
    # so a word at the very start of a line is never tokenized.  Left as is;
    # confirm whether that is intended before changing it.
    my $chunk_re = qr/
        \s+
        (?<lead>  [^\s\w]* )
        (?<word>  [\w']*   )
        (?<trail> [^\s\w]* )
    /x;

    while (my $line = <$fh>) {
        while ($line =~ /$chunk_re/g) {
            # Copy the captures right away so later code cannot clobber them.
            my ($lead, $word, $trail) = @+{qw(lead word trail)};

            # Leading symbols come from the FIRST capture (the old code
            # iterated the trailing capture here).  Splitting an empty
            # string yields no elements, so no emptiness guard is needed.
            for my $sym (split //, $lead) {
                push @tokens, insert_token(">$sym");
            }

            # Test length, not truth, so the word "0" is kept.
            if (length $word) {
                push @tokens, insert_token("|$word");
            }

            for my $sym (split //, $trail) {
                push @tokens, insert_token("<$sym");
            }
        }

        if ($lines_since_report++ == 100) {
            print tell($fh), "/$total bytes\r";
            $lines_since_report = 0;
        }
    }

    close $fh;
    say "";
}

# Rebuild the seq2 table from @tokens: the table is emptied, then one
# (t1, t2, next) row is inserted for every run of three consecutive ids.
#
# Takes no arguments; works on the file-level @tokens, $dbh and $map_h.
# Prints a progress line to STDOUT every 101 inserted rows.  With fewer than
# three tokens the table is only emptied.
sub map_corpus {
    say "mapping";

    # In autocommit mode SQLite makes every statement its own transaction,
    # which is very slow for bulk inserts.  Run the DELETE and all INSERTs in
    # one transaction instead; this also means an interrupted run leaves the
    # previous seq2 contents in place.  If a transaction is already open
    # (AutoCommit off) it is left to whoever opened it.
    my $own_txn = $dbh->{AutoCommit} && $dbh->begin_work;

    $dbh->do('DELETE FROM seq2');

    my $rows_since_report = 0;
    for my $i (2 .. $#tokens) {
        # Sliding window: the two preceding ids plus the current one.
        $map_h->execute(@tokens[$i - 2 .. $i]);

        if ($rows_since_report++ == 100) {
            print(($i + 1), "/", scalar @tokens, " tokens\r");
            $rows_since_report = 0;
        }
    }

    $dbh->commit if $own_txn;
    say "";
}

# Look up a row of the tokens table by id.
#
#   $tid - id to bind to the "SELECT token FROM tokens WHERE id = ?" handle
#
# Returns the result of fetchrow_array on that handle, evaluated in the
# caller's context.
sub get_token {
    my ($token_id) = @_;

    $id_token_h->execute($token_id);

    return $id_token_h->fetchrow_array;
}

# Entry point: every command-line argument is a corpus file.  All files are
# read first, then the transition table is rebuilt from the combined tokens.
unless (@ARGV) {
    say "Please specify text files";
    exit 1;
}

# A named loop variable is used on purpose; $_ is not passed around here.
foreach my $corpus_path (@ARGV) {
    read_corpus($corpus_path);
}

map_corpus();
say "Done!";