# /Vector/HtmlFilter.pm
use strict;
# Tag whitelist consulted by the HTML filter further down.  Keys are tag
# names; each value is a hash of the attribute names that tag may keep (a
# true value switches the attribute on).  An empty hash means the tag is let
# through with all of its attributes dropped.
#
# Only the tags below are enabled.  Entries that are switched off at the
# moment, with the attributes they would admit if re-enabled:
#   a (href), img (src width height alt), table (width border), td (align),
#   hr, blockquote, tr, abbr, acronym, br, cite, em, ol, ul, li, p
my %html = map { ( $_ => {} ) } qw(b i u big code pre s small sub sup);

# Regex alternations of element names.  Elements named by $nuke are replaced
# wholesale by a "[... tag nuked]" marker; elements named by $oingoboingo are
# replaced wholesale by a fixed snippet.
my $nuke        = q{script};
my $oingoboingo = join q{|}, qw(applet embed object);
{
    # Sanitise HTML in place.
    #
    # Takes (via shift) a reference to the scalar holding the markup and
    # rewrites that scalar; no useful value is returned.
    #
    # Three passes run in order:
    #   1. each element named by $nuke is replaced, content included, by a
    #      "[<name> tag nuked]" marker;
    #   2. each element named by $oingoboingo is replaced, content included,
    #      by a fixed snippet (whose own tags then go through pass 3 like any
    #      other markup);
    #   3. every remaining "<" is handled exactly once, left to right:
    #        - a complete tag whose name is a key of %html is rebuilt from
    #          its name plus the attributes %html allows for it;
    #        - any other complete tag is deleted;
    #        - a "<" that does not begin a complete tag (e.g. "<!--",
    #          "a < b", or a tag with no closing ">") is escaped as "&lt;"
    #          so that it cannot hide later markup from the filter.
    #
    # Tag and attribute names are matched case-insensitively.
    #
    # Dies with "Too many tags" on reaching the 10000th whitelisted tag.
    my $str = shift;
    my $c   = 0;

    # Returns the replacement text for one complete tag: the rebuilt tag
    # when its name is whitelisted, otherwise the empty string.
    my $rebuild = sub {
        # Copy the arguments first: they alias the capture variables, which
        # the matches below would overwrite.
        my ($slash, $name, $attr_text) = @_;

        my $allowed = $html{ lc($name) };
        return '' unless defined $allowed;

        my @kept;
        # Attributes are split on whitespace, so a quoted value containing
        # blanks is seen as several tokens.
        for my $attr (split /\s+/, $attr_text) {
            my ($key, $value) = split /=/, $attr, 2;
            $key   = '' unless defined $key;
            $value = '' unless defined $value;

            # NOTE(review): only the literal scheme is caught; entity-encoded
            # spellings are not decoded here.
            next if lc($key) eq 'href' && $value =~ /javascript:/i;

            push @kept, $attr if $allowed->{ lc($key) };
        }

        die "Too many tags" if ++$c == 10000;
        return join(' ', "<$slash$name", @kept) . '>';
    };

    # Pass 1: dangerous elements go, content and all.
    $$str =~ s#<($nuke)[^>]*>.*?</\1>#[$1 tag nuked]#gis;

    # Pass 2: applets, embeds and objects are swapped for a fixed snippet.
    $$str =~ s#<($oingoboingo)[^>]*>.*?</\1>#<p><a href="http://www.youtube.com/watch?v=jItz-uNjoZA">I love little girls</a>.#gis;

    # Pass 3: whitelist filter.  The optional group matches a complete tag;
    # when it cannot, the lone "<" is matched on its own.  The atomic groups
    # and the "<"-excluding attribute class keep a failed attempt from
    # scanning past the next "<", so the pass stays linear.
    $$str =~ s{
        <
        (?:
            (/?)           # group 1: slash of a closing tag, else empty
            ((?>\w+))      # group 2: tag name
            (?>\s*)
            ([^<>]*)       # group 3: raw attribute text
            >
        )?
    }{ defined $2 ? $rebuild->($1, $2, $3) : '&lt;' }gex;
}
{
    # Reduce markup to a plain-text approximation, in place.
    #
    # Takes (via shift) a reference to the scalar holding the markup.
    # Opening and closing <b>, <i> and <u> tags become "*", "/" and "_"
    # respectively; every other tag is removed.
    my $str = shift;

    # The lookahead insists that the tag name ends right after the letter,
    # so <big>, <br>, <img>, <ul> and friends are not mistaken for b/i/u.
    # The three substitutions run one after another, in this order.
    for my $pair ( [ b => '*' ], [ i => '/' ], [ u => '_' ] ) {
        my ($tag, $mark) = @$pair;
        $$str =~ s{</?$tag(?=[\s/>])[^>]*>}{$mark}gi;
    }

    # Whatever tags remain carry no plain-text equivalent: drop them.
    $$str =~ s{<[^>]*>}{}g;
}
1;