Take off and nuke 'em from orbit, it's the only way to be sure.
+package Dreamlands::Dreamer;
+use v5.10;
+use strict;
+use warnings;
+
+use Exporter 'import';
+use DBI;
+use Digest::SHA qw/sha1/;
+
+our @EXPORT_OK = qw(seed generate_paragraph);
+
+my $dbh = DBI->connect("dbi:SQLite:dbname=markov.db", "", "");
+
+my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
+
+sub seed {
+ my $text = shift;
+ my $bits = sha1($text);
+ my $n = unpack("L", $bits);
+ srand($n);
+}
+
+sub get_token {
+ my $tid = shift;
+ $id_token_h->execute($tid);
+ $id_token_h->fetchrow_array
+}
+
+sub random_select {
+ $_[int(rand(@_))]
+}
+
+sub generate_paragraph {
+ my %opts = @_;
+ if (!defined $opts{sentences}) {
+ $opts{sentences} = [2, 5];
+ }
+ if (!defined $opts{max_tokens}) {
+ $opts{max_tokens} = 500;
+ }
+
+ my @words_out;
+ my $glom_next = '';
+ my @t;
+ my $next_token_h = $dbh->prepare('SELECT next FROM seq2 WHERE t1=? AND t2=?');
+
+ my $crank = sub {
+ $next_token_h->execute(@t);
+ my $next = random_select(@{$next_token_h->fetchall_arrayref})->[0];
+ if (!defined $next) {
+ $next = random_select(
+ $dbh->selectall_array('SELECT next FROM seq2 WHERE t2=?', undef, $t[1])
+ )->[0];
+ if (!defined $next) {
+ # uhhhh, punt
+ @t = $dbh->selectrow_array('SELECT t1, t2, next FROM seq2 ORDER BY RANDOM() LIMIT 1');
+ $next = shift @t;
+ }
+ }
+ my $token = get_token($next);
+ push @t, $next;
+ shift @t;
+
+ my $chomp = substr($token, 0, 1);
+ if ($chomp eq '>') {
+ $glom_next .= substr($token, 1);
+ } elsif ($chomp eq '<') {
+ if (@words_out == 0) {
+ push @words_out, substr($token, 1);
+ } else {
+ $words_out[$#words_out] .= substr($token, 1);
+ }
+ $glom_next = '';
+ } elsif ($chomp eq '|') {
+ push @words_out, substr($token, 1);
+ $glom_next = '';
+ }
+ return $token;
+ };
+
+ # So first we need to bootstrap a couple of plausible words. We start by
+ # getting the token id for a terminal period.
+ @t = $dbh->selectrow_array('SELECT id FROM tokens WHERE token = "<."');
+ # Then seed our token sequence with it and a word that precedes it.
+ @t = @{random_select(
+ $dbh->selectall_array('SELECT t1, t2 FROM seq2 WHERE t2=?', undef, @t)
+ )};
+
+ my $target_sentences = int(rand($opts{sentences}->[1] - $opts{sentences}->[0])) + $opts{sentences}->[0];
+ my $sentences = 0;
+ my $token_count = 0;
+ while ($token_count < $opts{max_tokens}) {
+ my $token = &$crank;
+ $token_count++;
+ if ($token eq '<.' || $token eq '<!' || $token eq '<?') {
+ $sentences++;
+ last if $sentences == $target_sentences;
+ }
+ }
+
+ return @words_out;
+}
+
+1;
+# The Dreamlands
+
+The Dreamlands is a (probably) infinite web of text and links, meant to
+ensnare unwitting visitors in a never ending dream. It produces text
+from a markov generator with a two-token lookup. The markov data is
+stored in a local sqlite database, which means the memory usage is
+minimal at the cost of some CPU and I/O.
+
+## Setup
+
+The Dreamlands installs all of its dependencies locally, needing only a
+functioning Perl 5 (for values of 5 greater than 5.10) environment and
+[`cpanm`](https://metacpan.org/dist/App-cpanminus/view/bin/cpanm) (most
+likely available in your package manager as some variation of
+`cpanminus`).
+
+First, set up the environment.
+
+```
+$ . ./env.sh # Note the leading dot, it's important
+```
+
+Then, install dependencies and initialize the database.
+
+```
+$ ./init.sh
+```
+
+### Markov Training
+
+The markov generator must be trained before The Dreamlands can run. You
+will need UTF-8 text files to ingest into the markov generator. Give
+them to `./train.pl`.
+
+```
+$ ./train.pl a_novel.txt
+```
+
+This will take some time as it tokenizes the text and calculates the
+token relationships. The process can be restarted; already inserted
+tokens will be skipped. But the process is not incremental. The
+token relationships are always erased and rebuilt every time it is
+trained. If you want to change the text source, you should remove
+`markov.db`, re-run `./init.sh`, and re-train.
+
+## Running
+
+If you want to test that the markov generator is working properly, you
+can run `./generate.pl`. It will output one paragraph of text by
+default, or you can give it a number to output that many paragraphs.
+
+The web server runs from `./http.pl`, and by default starts on port
+8080. It takes no arguments, but you can change the port and path prefix
+by editing the script.
+
+## The Nitty Gritty
+
+The obvious way to use this is as a trap for badly behaved web crawlers.
+I recommend you add the path to `/robots.txt`, so that well behaved
+crawlers avoid it. Then any crawlers that do get stuck in it are
+obviously badly configured or hostile. You might use their presence as a
+source for a blocklist using e.g. [fail2ban](http://www.fail2ban.org).
+Or you might simply wish to give them a hard time by letting them go
+'round and 'round until they fall over.
+
+Each page randomly selects tokens seeded by a hash of the path, so each
+page's content should be stable. Links are randomly placed within the
+text, linking to other pages ad infinitum.
+
+### Templating
+
+Pages are rendered with
+[Template::Toolkit](https://template-toolkit.org/). The template is in
+`templates/page.tt`.
+eval $(perl -Mlocal::lib=./perl-lib)
+export PERL5LIB=$PERL5LIB:$PWD
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+use DBI;
+use Dreamlands::Dreamer qw/generate_paragraph/;
+
+my $paragraphs = shift;
+$paragraphs = 1 unless defined $paragraphs;
+
+for (1..$paragraphs) {
+ my @words = generate_paragraph();
+ say "@words";
+ say "" if $_ < $paragraphs;
+}
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+my $PORT = 8080;
+my $ROOT = '/';
+
+{
+ package Dreamlands::WebServer;
+ use HTTP::Server::Simple::CGI;
+ use base qw(HTTP::Server::Simple::CGI);
+
+ use FindBin qw/$Bin/;
+ use Template;
+ use Dreamlands::Dreamer qw/generate_paragraph/;
+
+ my $tt = Template->new({
+ INCLUDE_PATH => "$Bin/templates",
+ });
+
+ sub linkify {
+ my $links = int(rand(3));
+ my $n;
+ my @ranges;
+ for (1..$links) {
+ my $start = int(rand(@_));
+ my $end = $start + int(rand(3));
+ if ($end > @_ - 1) {
+ $end = @_ - 1;
+ }
+ for my $r (@ranges) {
+ if (($start >= $r->[0] && $start <= $r->[1]) || ($end >= $r->[0] && $end <= $r->[1])) {
+ # Range overlap; try again
+ next;
+ }
+ }
+ push @ranges, [$start, $end];
+ }
+ for my $r (@ranges) {
+ my $slug = join('-', map {
+ s/[^\w]//g; lc $_
+ } @_[$r->[0]..$r->[1]]);
+ $_[$r->[0]] = qq{<a href="$ROOT$slug">} . $_[$r->[0]];
+ $_[$r->[1]] .= '</a>';
+ }
+ @_
+ }
+
+ sub handle_request {
+ my ($self, $cgi) = @_;
+
+ my $path = $cgi->path_info();
+
+ print "HTTP/1.0 200 OK\r\n";
+ print $cgi->header(-charset => 'utf-8');
+
+ Dreamlands::Dreamer::seed($path);
+ my $para_count = int(rand(3)) + 2;
+ my $content = '';
+ for (1..$para_count) {
+ $content .= join(' ', linkify(generate_paragraph));
+ $content .= "\n\n" unless $_ == $para_count;
+ }
+
+ $tt->process('page.tt', {
+ content => $content,
+ path => $path,
+ });
+ }
+}
+
+my $server = Dreamlands::WebServer->new($PORT)->run;
+#!/bin/sh
+
+cpanm DBD::SQLite HTTP::Server::Simple
+
+echo Initializing markov.db
+sqlite3 markov.db <schema.sql
+CREATE TABLE IF NOT EXISTS tokens (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ token VARCHAR,
+ CONSTRAINT unique_token UNIQUE (token) ON CONFLICT IGNORE
+);
+
+CREATE TABLE IF NOT EXISTS seq2 (
+ t1 INTEGER,
+ t2 INTEGER,
+ next INTEGER
+);
+
+DROP INDEX IF EXISTS seq2_t1_t2;
+CREATE INDEX seq2_t1_t2 ON seq2 (t1, t2);
+<!DOCTYPE html>
+<h1>
+[% IF path == '/' %]
+Welcome!
+[% ELSE %]
+[% path | replace('^/', '') | replace('-', ' ') | ucfirst %]
+[% END %]
+</h1>
+
+[% FILTER html_para %]
+[% content %]
+[% END %]
+#!/usr/bin/env perl
+use v5.10;
+use strict;
+use warnings;
+
+use DBI;
+
+$|++;
+
+my $dbh = DBI->connect("dbi:SQLite:dbname=markov.db", "", "");
+
+my $token_id_h = $dbh->prepare('SELECT id FROM tokens WHERE token = ?');
+my $id_token_h = $dbh->prepare('SELECT token FROM tokens WHERE id = ?');
+my $insert_h = $dbh->prepare('INSERT OR IGNORE INTO tokens (token) VALUES (?)');
+my $map_h = $dbh->prepare('INSERT INTO seq2 (t1, t2, next) VALUES (?, ?, ?)');
+
+sub insert_token {
+ my $token = shift;
+ $token_id_h->execute($token);
+ my $tid = $token_id_h->fetchrow_array;
+ if (!defined $tid) {
+ $insert_h->execute($token);
+ $tid = $insert_h->last_insert_id();
+ }
+ return $tid;
+}
+
+my @tokens;
+
+sub read_corpus {
+ my $filename = shift;
+ say "reading $filename";
+ my $total = -s $filename;
+ my $c = 0;
+ open F, '<:utf8', $filename;
+ while (<F>) {
+ while (/\s+([^\s\w]*)([\w']*)([^\s\w]*)/g) {
+ next unless $1 || $2 || $3;
+ #say "|$1|$2|$3|";
+ if ($1) {
+ for my $sym (split(//, $3)) {
+ push @tokens, insert_token(">$sym");
+ }
+ }
+ if ($2) {
+ push @tokens, insert_token("|$2");
+ }
+ if ($3) {
+ for my $sym (split(//, $3)) {
+ push @tokens, insert_token("<$sym");
+ }
+ }
+ }
+ if ($c++ == 100) {
+ print tell(F), "/$total bytes\r";
+ $c = 0;
+ }
+ }
+ close F;
+ say "";
+}
+
+sub map_corpus {
+ say "mapping";
+ $dbh->do('DELETE FROM seq2');
+ my $c = 0;
+ my @t_seq = @tokens[0..1];
+ for my $i (2..$#tokens) {
+ my $t = $tokens[$i];
+ push @t_seq, $t;
+ $map_h->execute(@t_seq);
+ shift @t_seq;
+ if ($c++ == 100) {
+ print(($i + 1), "/", scalar @tokens, " tokens\r");
+ $c = 0;
+ }
+ }
+ say "";
+}
+
+sub get_token {
+ my $tid = shift;
+ $id_token_h->execute($tid);
+ $id_token_h->fetchrow_array
+}
+
+if (!@ARGV) {
+ say "Please specify text files";
+ exit(1);
+}
+
+for my $input_file (@ARGV) {
+ read_corpus($input_file);
+}
+map_corpus;
+say "Done!";