# train.pl
use v5.10;
use strict;
use warnings;
use DBI;
# Unbuffer STDOUT so the "\r"-terminated progress lines show up immediately.
$| = 1;

# Open the SQLite database in the current directory.  RaiseError turns any
# failed prepare/execute into an exception instead of an undef that only
# blows up later as "Can't call method ... on an undefined value".
# AutoCommit is left at its default (on) and spelled out for clarity.
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=markov.db", "", "",
    { RaiseError => 1, PrintError => 0, AutoCommit => 1 },
) or die "Cannot open markov.db: $DBI::errstr\n";

# Prepared statements shared by the rest of the script.
# token text -> id
my $token_id_h = $dbh->prepare('SELECT id FROM tokens WHERE token = ?');
# id -> token text
my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
# add a token unless an equal one is already stored
my $insert_h = $dbh->prepare('INSERT OR IGNORE INTO tokens (token) VALUES (?)');
# record one (t1, t2) -> next transition
my $map_h = $dbh->prepare('INSERT INTO seq2 (t1, t2, next) VALUES (?, ?, ?)');
# insert_token($token)
#
# Return the id of $token in the `tokens` table, inserting the token first
# when it is not stored yet.
#
#   $token  - token text to look up or add
#   returns - the value of tokens.id for that text
#
# Uses the file-level $dbh, $token_id_h and $insert_h handles.
sub insert_token {
    my ($token) = @_;

    # selectrow_array() executes the prepared SELECT, fetches the first row
    # and finishes the handle, so no statement is left active.
    my ($tid) = $dbh->selectrow_array($token_id_h, undef, $token);
    return $tid if defined $tid;

    $insert_h->execute($token);

    # Look the id up again rather than trusting last_insert_id(): the
    # statement is INSERT OR IGNORE, and when the row is ignored no new
    # rowid is generated, so last_insert_id() would report an unrelated row.
    ($tid) = $dbh->selectrow_array($token_id_h, undef, $token);
    return $tid;
}
my @tokens;
# read_corpus($filename)
#
# Tokenise one UTF-8 text file and append the database id of every token,
# in reading order, to the file-level @tokens array.
#
# Each whitespace-preceded chunk is split into three parts, and every part
# is stored with a one-character prefix that records its role:
#   ">c"    one entry per punctuation character in front of the word
#   "|word" the word itself (word characters and apostrophes)
#   "<c"    one entry per punctuation character after the word
#
#   $filename - path of the text file to read
#
# Dies if the file cannot be opened.  Prints a byte-offset progress line
# roughly every hundred input lines.
sub read_corpus {
    my ($filename) = @_;

    say "reading $filename";
    my $total = -s $filename;

    open my $fh, '<:utf8', $filename
        or die "Cannot open '$filename' for reading: $!\n";

    my $lines_since_report = 0;
    while (my $line = <$fh>) {
        # NOTE(review): the pattern requires whitespace *before* each chunk,
        # so a word at the very start of a line (and any text glued to a
        # previous chunk without whitespace) is never tokenised.  Left as is
        # because changing it alters the trained model - confirm intent.
        while ($line =~ /\s+([^\s\w]*)([\w']*)([^\s\w]*)/g) {
            # Copy the captures right away; they are only valid until the
            # next successful match.
            my ($lead, $word, $trail) = ($1, $2, $3);

            # Test with length(), not truthiness: the word "0" is a real
            # token even though the string "0" is false in Perl.
            if (length $lead) {
                # Leading punctuation comes from the first capture.
                for my $sym (split //, $lead) {
                    push @tokens, insert_token(">$sym");
                }
            }
            if (length $word) {
                push @tokens, insert_token("|$word");
            }
            if (length $trail) {
                for my $sym (split //, $trail) {
                    push @tokens, insert_token("<$sym");
                }
            }
        }

        if ($lines_since_report++ == 100) {
            print tell($fh), "/$total bytes\r";
            $lines_since_report = 0;
        }
    }

    close $fh;
    say "";
}
# map_corpus()
#
# Rebuild the `seq2` table from the file-level @tokens array: the table is
# emptied, then one row (t1, t2, next) is inserted for every run of three
# consecutive token ids.  With fewer than three tokens the table is simply
# left empty.
#
# The DELETE and all INSERTs run inside a single transaction.  Without it
# every INSERT is committed on its own, which is very slow for large
# corpora, and a failure half-way through would leave a partial table.
# On any database error the transaction is rolled back and the sub dies.
#
# Prints a progress line roughly every hundred tokens.
sub map_corpus {
    say "mapping";

    $dbh->begin_work
        or die "Cannot start transaction: " . $dbh->errstr . "\n";

    my $ok = eval {
        # do() returns the true string "0E0" when zero rows are affected,
        # so a false value here always means an error.
        $dbh->do('DELETE FROM seq2')
            or die "Cannot clear seq2: " . $dbh->errstr . "\n";

        my $since_report = 0;
        for my $i (2 .. $#tokens) {
            # Sliding window: the two preceding tokens and the current one.
            $map_h->execute(@tokens[ $i - 2 .. $i ])
                or die "Cannot insert into seq2: " . $map_h->errstr . "\n";

            if ($since_report++ == 100) {
                print(($i + 1), "/", scalar @tokens, " tokens\r");
                $since_report = 0;
            }
        }

        $dbh->commit
            or die "Cannot commit: " . $dbh->errstr . "\n";
        1;
    };
    if (!$ok) {
        my $error = $@ || "unknown error while mapping\n";
        eval { $dbh->rollback };    # best effort; report the original error
        die $error;
    }

    say "";
}
# get_token($tid)
#
# Look up the text of the token whose id is $tid in the `tokens` table.
#
#   $tid    - value of tokens.id
#   returns - in scalar context the token text (undef when no row matches);
#             in list context the fetched row, or an empty list
#
# Uses the file-level $dbh and $id_token_h handles.
#
# NOTE(review): the `sub` header was missing from the reviewed source and no
# caller is visible, so the name `get_token` is a placeholder - confirm it
# against the rest of the project.
sub get_token {
    my ($tid) = @_;

    # selectrow_array() executes the prepared statement, fetches the first
    # row and finishes the handle.
    return $dbh->selectrow_array($id_token_h, undef, $tid);
}
# Script entry point: every command-line argument is a text file to learn
# from.  All files are tokenised first, then the transition table is built
# once from the combined token stream.
unless (@ARGV) {
    say "Please specify text files";
    exit(1);
}

foreach my $corpus_path (@ARGV) {
    read_corpus($corpus_path);
}

map_corpus();

say "Done!";