# Vector/HtmlFilter.pm
package Vector::HtmlFilter;
use strict;

# Tag whitelist consulted by htmlfilter(): tag name => { attribute name => 1 }
# for every attribute that may be kept.  An empty hash allows the bare tag
# only, so all of its attributes are stripped.
my %html;
$html{$_} = {} for qw(b i u big code pre s small sub sup);

# Entries that are currently switched off, in whitelist layout:
#	a     => { href => 1 },
#	img   => { src => 1, width => 1, height => 1, alt => 1 },
#	table => { width => 1, border => 1 },
#	td    => { align => 1 },
#	without attributes: hr blockquote tr abbr acronym br cite em ol ul li p

# Regex alternation of element names that htmlfilter() replaces, content
# included, with a "[... tag nuked]" marker.
my $nuke = 'script';

# Regex alternation of element names that htmlfilter() replaces, content
# included, with a fixed placeholder link.
my $oingoboingo = join '|', qw(applet embed object);

# Sanitise the HTML held in the referenced string, modifying it in place.
#
# Argument:
#   $str - reference to the scalar containing the markup.
#
# Processing order:
#   1. Elements named by $nuke are replaced, content included, by the text
#      "[<name> tag nuked]".
#   2. Elements named by $oingoboingo are replaced, content included, by a
#      fixed placeholder link (whose own tags then go through step 3).
#   3. Every remaining complete tag is checked against %html.  Tags that are
#      not whitelisted are deleted (the text between them is kept);
#      whitelisted tags are re-emitted with only their whitelisted
#      attributes.  Attributes are split on whitespace, so quoted values
#      containing spaces are not handled as a unit.
#
# A '<' that does not start a complete tag (no tag name after it, or no
# closing '>') is left untouched and does not stop the filtering of the tags
# that follow it.
#
# Dies with "Too many tags" when a single pass reaches its 10000th
# whitelisted tag.  Returns nothing.
sub htmlfilter($) {
	my $str = shift;

	# Step 1: drop dangerous elements together with their content.
	$$str =~ s#<($nuke)[^>]*>.*?</\1>#[$1 tag nuked]#gs;

	# Step 2: swap embedded-content elements for the placeholder link.
	$$str =~ s#<($oingoboingo)[^>]*>.*?</\1>#<p><a href="http://www.youtube.com/watch?v=jItz-uNjoZA">I love little girls</a>.#gs;

	# Step 3: whitelist tags and attributes.
	my ($kept, $dropped);
	my $rewrite = sub {
		my ($end, $tag, $rest) = @_;
		my $allowed = $html{$tag};

		unless (defined $allowed) {
			$dropped++;
			return '';
		}
		die "Too many tags" if ++$kept == 10000;

		my @oattrs;
		for my $attr (split(/\s+/, $rest)) {
			# Limit 2 keeps the whole value, even when it contains '='.
			my ($name, $value) = split('=', $attr, 2);
			next unless defined $name && $allowed->{$name};
			# URL schemes are case-insensitive, so match "JavaScript:" too.
			next if $name eq 'href' && defined $value
				&& $value =~ /["']?javascript:/i;
			push @oattrs, $attr;
		}
		return "<$end$tag" . join('', map { " $_" } @oattrs) . '>';
	};

	# Deleting a tag can splice its neighbours into a new tag
	# ("<" . "<x>" . "img ...>"), so repeat until a pass deletes nothing.
	# Each deletion shortens the string, which guarantees termination.
	do {
		($kept, $dropped) = (0, 0);
		$$str =~ s{<(/?)(\w+)\s*([^>]*)>}{ $rewrite->($1, $2, $3) }ge;
	} while ($dropped);

	return;
}

# Reduce simple HTML in the referenced string to plain text, in place.
#
# Argument:
#   $str - reference to the scalar containing the markup.
#
# Opening and closing <b>, <i> and <u> tags (any letter case, with or without
# attributes) become '*', '/' and '_' respectively; every other tag is
# removed and the text between tags is kept.
sub textify {
	my $str = shift;

	# The \b after the tag name stops the one-letter names from also matching
	# longer tags such as <big>, <br>, <img> or <ul>.
	$$str =~ s'</?b\b[^>]*>'*'gi;
	$$str =~ s'</?i\b[^>]*>'/'gi;
	$$str =~ s'</?u\b[^>]*>'_'gi;

	# Anything still shaped like a tag is dropped.
	$$str =~ s'<[^>]*>''g;
}

1;