git.bytex64.net -- dreamlands

commit:	937d4b4fa57bbe043fdc46da32acc9ce80f6d4f1
author:	Chip
committer:	Chip
date:	Wed Jan 29 00:28:34 2025 -0600
parents:

Take off and nuke 'em from orbit, it's the only way to be sure.

diff --git a/.gitignore b/.gitignore
line changes: +2/-0
index 0000000..40b1b04
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1,2 @@
+markov.db
+perl-lib/

diff --git a/Dreamlands/Dreamer.pm b/Dreamlands/Dreamer.pm
line changes: +104/-0
index 0000000..2a21aa7
--- /dev/null
+++ b/Dreamlands/Dreamer.pm

@@ -0,0 +1,104 @@
+package Dreamlands::Dreamer;
+use v5.10;
+use strict;
+use warnings;
+
+use Exporter 'import';
+use DBI;
+use Digest::SHA qw/sha1/;
+
+our @EXPORT_OK = qw(seed generate_paragraph);
+
+my $dbh = DBI->connect("dbi:SQLite:dbname=markov.db", "", "");
+
+my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
+
+sub seed {
+    my $text = shift;
+    my $bits = sha1($text);
+    my $n = unpack("L", $bits);
+    srand($n);
+}
+
+sub get_token {
+    my $tid = shift;
+    $id_token_h->execute($tid);
+    $id_token_h->fetchrow_array
+}
+
+sub random_select {
+    $_[int(rand(@_))]
+}
+
+sub generate_paragraph {
+    my %opts = @_;
+    if (!defined $opts{sentences}) {
+        $opts{sentences} = [2, 5];
+    }
+    if (!defined $opts{max_tokens}) {
+        $opts{max_tokens} = 500;
+    }
+
+    my @words_out;
+    my $glom_next = '';
+    my @t;
+    my $next_token_h = $dbh->prepare('SELECT next FROM seq2 WHERE t1=? AND t2=?');
+
+    my $crank = sub {
+        $next_token_h->execute(@t);
+        my $next = random_select(@{$next_token_h->fetchall_arrayref})->[0];
+        if (!defined $next) {
+            $next = random_select(
+                $dbh->selectall_array('SELECT next FROM seq2 WHERE t2=?', undef, $t[1])
+            )->[0];
+            if (!defined $next) {
+                # uhhhh, punt
+                @t = $dbh->selectrow_array('SELECT t1, t2, next FROM seq2 ORDER BY RANDOM() LIMIT 1');
+                $next = shift @t;
+            }
+        }
+        my $token = get_token($next);
+        push @t, $next;
+        shift @t;
+
+        my $chomp = substr($token, 0, 1);
+        if ($chomp eq '>') {
+            $glom_next .= substr($token, 1);
+        } elsif ($chomp eq '<') {
+            if (@words_out == 0) {
+                push @words_out, substr($token, 1);
+            } else {
+                $words_out[$#words_out] .= substr($token, 1);
+            }
+            $glom_next = '';
+        } elsif ($chomp eq '|') {
+            push @words_out, substr($token, 1);
+            $glom_next = '';
+        }
+        return $token;
+    };
+
+    # So first we need to bootstrap a couple of plausible words. We start by
+    # getting the token id for a terminal period.
+    @t = $dbh->selectrow_array('SELECT id FROM tokens WHERE token = "<."');
+    # Then seed our token sequence with it and a word that precedes it.
+    @t = @{random_select(
+        $dbh->selectall_array('SELECT t1, t2 FROM seq2 WHERE t2=?', undef, @t)
+    )};
+
+    my $target_sentences = int(rand($opts{sentences}->[1] - $opts{sentences}->[0])) + $opts{sentences}->[0];
+    my $sentences = 0;
+    my $token_count = 0;
+    while ($token_count < $opts{max_tokens}) {
+        my $token = &$crank;
+        $token_count++;
+        if ($token eq '<.' || $token eq '<!' || $token eq '<?') {
+            $sentences++;
+            last if $sentences == $target_sentences;
+        }
+    }
+
+    return @words_out;
+}
+
+1;

diff --git a/README.md b/README.md
line changes: +74/-0
index 0000000..da40943
--- /dev/null
+++ b/README.md

@@ -0,0 +1,74 @@
+# The Dreamlands
+
+The Dreamlands is a (probably) infinite web of text and links, meant to
+ensnare unwitting visitors in a never ending dream. It produces text
+from a markov generator with a two-token lookup. The markov data is
+stored in a local sqlite database, which means the memory usage is
+minimal at the cost of some CPU and I/O.
+
+## Setup
+
+The Dreamlands installs all of its dependencies locally, needing only a
+functioning Perl 5 (for values of 5 greater than 5.10) environment and
+[`cpanm`](https://metacpan.org/dist/App-cpanminus/view/bin/cpanm) (most
+likely available in your package manager as some variation of
+`cpanminus`).
+
+First, set up the environment.
+
+```
+$ . ./env.sh  # Note the leading dot, it's important
+```
+
+Then, install dependencies and initialize the database.
+
+```
+$ ./init.sh
+```
+
+### Markov Training
+
+The markov generator must be trained before The Dreamlands can run. You
+will need UTF-8 text files to ingest into the markov generator. Give
+them to `./train.pl`.
+
+```
+$ ./train.pl a_novel.txt
+```
+
+This will take some time as it tokenizes the text and calculates the
+token relationships. The process can be restarted; already inserted
+tokens will be skipped. But the process is not incremental. The
+token relationships are always erased and rebuilt every time it is
+trained. If you want to change the text source, you should remove
+`markov.db`, re-run `./init.sh`, and re-train.
+
+## Running
+
+If you want to test that the markov generator is working properly, you
+can run `./generate.pl`. It will output one paragraph of text by
+default, or you can give it a number to output that many paragraphs.
+
+The web server runs from `./http.pl`, and by default starts on port
+8080. It takes no arguments, but you can change the port and path prefix
+by editing the script.
+
+## The Nitty Gritty
+
+The obvious way to use this is as a trap for badly behaved web crawlers.
+I recommend you add the path to `/robots.txt`, so that well behaved
+crawlers avoid it. Then any crawlers that do get stuck in it are
+obviously badly configured or hostile. You might use their presence as a
+source for a blocklist using e.g. [fail2ban](http://www.fail2ban.org).
+Or you might simply wish to give them a hard time by letting them go
+'round and 'round until they fall over.
+
+Each page randomly selects tokens seeded by a hash of the path, so each
+page's content should be stable. Links are randomly placed within the
+text, linking to other pages ad infinitum.
+
+### Templating
+
+Pages are rendered with
+[Template::Toolkit](https://template-toolkit.org/). The template is in
+`templates/page.tt`.

diff --git a/env.sh b/env.sh
line changes: +2/-0
index 0000000..aaa7bd8
--- /dev/null
+++ b/env.sh

@@ -0,0 +1,2 @@
+eval $(perl -Mlocal::lib=./perl-lib)
+export PERL5LIB=$PERL5LIB:$PWD

diff --git a/generate.pl b/generate.pl
line changes: +16/-0
index 0000000..6a3f108
--- /dev/null
+++ b/generate.pl

@@ -0,0 +1,16 @@
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+use DBI;
+use Dreamlands::Dreamer qw/generate_paragraph/;
+
+my $paragraphs = shift;
+$paragraphs = 1 unless defined $paragraphs;
+
+for (1..$paragraphs) {
+    my @words = generate_paragraph();
+    say "@words";
+    say "" if $_ < $paragraphs;
+}

diff --git a/http.pl b/http.pl
line changes: +73/-0
index 0000000..b7a7494
--- /dev/null
+++ b/http.pl

@@ -0,0 +1,73 @@
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+my $PORT = 8080;
+my $ROOT = '/';
+
+{
+    package Dreamlands::WebServer;
+    use HTTP::Server::Simple::CGI;
+    use base qw(HTTP::Server::Simple::CGI);
+
+    use FindBin qw/$Bin/;
+    use Template;
+    use Dreamlands::Dreamer qw/generate_paragraph/;
+
+    my $tt = Template->new({
+        INCLUDE_PATH => "$Bin/templates",
+    });
+
+    sub linkify {
+        my $links = int(rand(3));
+        my $n;
+        my @ranges;
+        for (1..$links) {
+            my $start = int(rand(@_));
+            my $end = $start + int(rand(3));
+            if ($end > @_ - 1) {
+                $end = @_ - 1;
+            }
+            for my $r (@ranges) {
+                if (($start >= $r->[0] && $start <= $r->[1]) || ($end >= $r->[0] && $end <= $r->[1])) {
+                    # Range overlap; try again
+                    next;
+                }
+            }
+            push @ranges, [$start, $end];
+        }
+        for my $r (@ranges) {
+            my $slug = join('-', map {
+                s/[^\w]//g; lc $_
+            } @_[$r->[0]..$r->[1]]);
+            $_[$r->[0]] = qq{<a href="$ROOT$slug">} . $_[$r->[0]];
+            $_[$r->[1]] .= '</a>';
+        }
+        @_
+    }
+
+    sub handle_request {
+        my ($self, $cgi) = @_;
+
+        my $path = $cgi->path_info();
+        
+        print "HTTP/1.0 200 OK\r\n";
+        print $cgi->header(-charset => 'utf-8');
+
+        Dreamlands::Dreamer::seed($path);
+        my $para_count = int(rand(3)) + 2;
+        my $content = '';
+        for (1..$para_count) {
+            $content .= join(' ', linkify(generate_paragraph));
+            $content .= "\n\n" unless $_ == $para_count;
+        }
+
+        $tt->process('page.tt', {
+            content => $content,
+            path => $path,
+        });
+    }
+}
+
+my $server = Dreamlands::WebServer->new($PORT)->run;

diff --git a/init.sh b/init.sh
line changes: +6/-0
index 0000000..6d889a7
--- /dev/null
+++ b/init.sh

@@ -0,0 +1,6 @@
+#!/bin/sh
+
+cpanm DBD::SQLite HTTP::Server::Simple
+
+echo Initializing markov.db
+sqlite3 markov.db <schema.sql

diff --git a/schema.sql b/schema.sql
line changes: +14/-0
index 0000000..149dc70
--- /dev/null
+++ b/schema.sql

@@ -0,0 +1,14 @@
+CREATE TABLE IF NOT EXISTS tokens (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    token VARCHAR,
+    CONSTRAINT unique_token UNIQUE (token) ON CONFLICT IGNORE
+);
+
+CREATE TABLE IF NOT EXISTS seq2 (
+    t1 INTEGER,
+    t2 INTEGER,
+    next INTEGER
+);
+
+DROP INDEX IF EXISTS seq2_t1_t2;
+CREATE INDEX seq2_t1_t2 ON seq2 (t1, t2);

diff --git a/templates/page.tt b/templates/page.tt
line changes: +12/-0
index 0000000..19205c7
--- /dev/null
+++ b/templates/page.tt

@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<h1>
+[% IF path == '/' %]
+Welcome!
+[% ELSE %]
+[% path | replace('^/', '') | replace('-', ' ') | ucfirst %]
+[% END %]
+</h1>
+
+[% FILTER html_para %]
+[% content %]
+[% END %]

diff --git a/train.pl b/train.pl
line changes: +96/-0
index 0000000..e42ac1e
--- /dev/null
+++ b/train.pl

@@ -0,0 +1,96 @@
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+use DBI;
+
+$|++;
+
+my $dbh = DBI->connect("dbi:SQLite:dbname=markov.db", "", "");
+
+my $token_id_h = $dbh->prepare('SELECT id FROM tokens WHERE token = ?');
+my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
+my $insert_h = $dbh->prepare('INSERT OR IGNORE INTO tokens (token) VALUES (?)');
+my $map_h = $dbh->prepare('INSERT INTO seq2 (t1, t2, next) VALUES (?, ?, ?)');
+
+sub insert_token {
+    my $token = shift;
+    $token_id_h->execute($token);
+    my $tid = $token_id_h->fetchrow_array;
+    if (!defined $tid) {
+        $insert_h->execute($token);
+        $tid = $insert_h->last_insert_id();
+    }
+    return $tid;
+}
+
+my @tokens;
+
+sub read_corpus {
+    my $filename = shift;
+    say "reading $filename";
+    my $total = -s $filename;
+    my $c = 0;
+    open F, '<:utf8', $filename;
+    while (<F>) {
+        while (/\s+([^\s\w]*)([\w']*)([^\s\w]*)/g) {
+            next unless $1 || $2 || $3;
+            #say "|$1|$2|$3|";
+            if ($1) {
+                for my $sym (split(//, $3)) {
+                    push @tokens, insert_token(">$sym");
+                }
+            }
+            if ($2) {
+                push @tokens, insert_token("|$2");
+            }
+            if ($3) {
+                for my $sym (split(//, $3)) {
+                    push @tokens, insert_token("<$sym");
+                }
+            }
+        }
+        if ($c++ == 100) {
+            print tell(F), "/$total bytes\r";
+            $c = 0;
+        }
+    }
+    close F;
+    say "";
+}
+
+sub map_corpus {
+    say "mapping";
+    $dbh->do('DELETE FROM seq2');
+    my $c = 0;
+    my @t_seq = @tokens[0..1];
+    for my $i (2..$#tokens) {
+        my $t = $tokens[$i];
+        push @t_seq, $t;
+        $map_h->execute(@t_seq);
+        shift @t_seq;
+        if ($c++ == 100) {
+            print(($i + 1), "/", scalar @tokens, " tokens\r");
+            $c = 0;
+        }
+    }
+    say "";
+}
+
+sub get_token {
+    my $tid = shift;
+    $id_token_h->execute($tid);
+    $id_token_h->fetchrow_array
+}
+
+if (!@ARGV) {
+    say "Please specify text files";
+    exit(1);
+}
+
+for my $input_file (@ARGV) {
+    read_corpus($input_file);
+}
+map_corpus;
+say "Done!";