/youtube-subrip
#!/usr/bin/perl
use LWP::Simple;
use XML::Twig;
use strict;

# The single command-line argument is either a full YouTube URL (containing
# "v=<id>") or a bare video id.
my $url = shift;

# Without an argument there is nothing to fetch; say so instead of sending
# a request with an empty video id.
unless (defined $url && length $url) {
	print STDERR "Usage: $0 <youtube-url-or-video-id>\n";
	exit 1;
}

# Read $1 only when the match succeeded (a failed match does not reset it).
# An argument without "v=" is taken to be the video id itself.
my $id = $url =~ /v=([A-Za-z0-9_-]+)/ ? $1 : $url;

# File-level state shared with the XML::Twig handlers defined further down.
my ($xml, $t, %langs, $lang_default, $name);

print STDERR "Fetching subtitle list...";

# Ask the timedtext service for the list of caption tracks of this video.
my $list_url = "http://video.google.com/timedtext?hl=en&v=$id&type=list";
$xml = get($list_url);

# get() returns undef when the download failed.
unless (defined $xml) {
	print STDERR "Failed\n";
	exit 1;
}
print STDERR "OK\n";

# Every <track> element of the list is handed to add_track.
$t = XML::Twig->new(twig_roots => { 'track' => \&add_track });
$t->parse($xml);

# XML::Twig handler called for each <track> element of the track list.
# Records the track's name in %langs under its language code, and remembers
# the language code of the track whose lang_default attribute is "true" in
# $lang_default.
sub add_track {
	my ($twig, $track) = @_;
	my $code = $track->att('lang_code');

	$langs{$code} = $track->att('name');
	$lang_default = $code if $track->att('lang_default') eq 'true';
}

# Choose the track to download: the one the track list flagged as default,
# falling back to English when no track was flagged. The name and the
# language code sent to the service must belong to the same track.
my $lang = defined $lang_default ? $lang_default : 'en';
$name = defined $langs{$lang} ? $langs{$lang} : '';

# The track name is free text, so percent-encode it (as UTF-8 bytes) before
# placing it in the query string; otherwise a space, '&' or '#' in the name
# would corrupt the request.
my $name_param = $name;
utf8::encode($name_param);
$name_param =~ s/([^A-Za-z0-9_.~-])/sprintf('%%%02X', ord($1))/ge;

print STDERR "Fetching subtitles... ";

$xml = get("http://video.google.com/timedtext?hl=en&v=$id&type=track&name=$name_param&lang=$lang");

# An empty body is treated as a failure too: there is nothing to convert and
# the XML parser would only die on it.
if (defined $xml && length $xml) {
	print STDERR "OK\n";
} else {
	print STDERR "Failed\n";
	exit 1;
}

# Running cue number used by convert_text. It is initialised before the
# parse because the handler increments it while parsing is in progress.
my $sub_count = 0;

# Every <text> element of the track is handed to convert_text.
my $text_twig = XML::Twig->new(
	twig_roots => { 'text' => \&convert_text }
);

$text_twig->parse($xml);

# Decode one level of character entities in a caption string.
#
# Handles decimal (&#39;) and hexadecimal (&#x27;) character references and
# the five predefined named entities: &quot; &amp; &apos; &lt; &gt;.
# All replacements happen in a single pass, so the result of one replacement
# is never decoded a second time ("&amp;lt;" becomes "&lt;", not "<").
# Entities that are not recognised are left as they are.
#
# Takes the encoded string and returns the decoded copy; an undefined
# argument is returned unchanged.
sub ent_decode {
	my $text = shift;
	return $text unless defined $text;

	my %named = (
		quot => '"',
		amp  => '&',
		apos => "'",
		lt   => '<',
		gt   => '>',
	);

	# $1: decimal code point, $2: hexadecimal code point, $3: entity name.
	$text =~ s{&(?:#(\d+)|#[xX]([0-9A-Fa-f]+)|(quot|amp|apos|lt|gt));}{
		defined $1   ? chr($1)
		: defined $2 ? chr(hex($2))
		:              $named{$3}
	}ge;

	return $text;
}

# Format a time offset in (possibly fractional) seconds as a SubRip
# timestamp of the form "HH:MM:SS,mmm".
#
# The value is rounded to whole milliseconds first and only then split into
# fields. Taking the fraction on its own and truncating it loses a
# millisecond to floating-point error (1.001 would print as ",000"), and
# rounding the fraction separately could not carry into the seconds.
#
# Takes the offset in seconds (an undefined value counts as 0) and returns
# the formatted string.
sub timecode {
	my $n = shift;
	$n = 0 unless defined $n;

	my $total_ms = int($n * 1000 + 0.5);
	my $ms       = $total_ms % 1000;
	my $in       = int($total_ms / 1000);

	my $s = $in % 60;
	my $m = int($in / 60) % 60;
	my $h = int($in / 3600);
	return sprintf("%02d:%02d:%02d,%03d", $h, $m, $s, $ms);
}

# XML::Twig handler called for each <text> element of the caption track.
# Writes the element to standard output as one SubRip cue: the running cue
# number, the "start --> end" time range, the decoded caption text and an
# empty line. Finally calls purge on the element.
sub convert_text {
	my ($twig, $cue) = @_;

	# Only start and duration are given; the end time is their sum.
	my $from = $cue->att('start');
	my $to   = $from + $cue->att('dur');

	my $index = ++$sub_count;
	my $range = timecode($from) . ' --> ' . timecode($to);
	my $body  = ent_decode($cue->first_child_text);

	print "$index\n", "$range\n", $body, "\n\n";

	$cue->purge();
}