YouTube – Downloading auto-generated YouTube transcriptions

subtitles, youtube

Is there a way to download the automatically generated YouTube transcriptions without downloading the video?

I would like to check out the TED talks, but I have limited bandwidth and would like to export the automated transcripts (also possibly known as subtitles or closed captions).

Best Answer

Use the "Network inspector" function of your browser's script debugger: after enabling the transcribed subtitles, find the second request to the timedtext page, then copy that entire request URL into the address bar to download the subtitles in YouTube's native XML format.

To get an SRT version, run this code in the debugger console on that XML page:

/**
 * Format a time offset in seconds as an SRT timestamp, "HH:MM:SS,mmm".
 *
 * The value is rounded to whole milliseconds once, and every field is then
 * derived from that integer. Rounding the fractional part on its own can
 * produce 1000 ms without carrying into the seconds field (for example
 * 1.9996 would become "00:00:01,1000").
 *
 * @param {number} time - Offset in seconds (may be fractional).
 * @returns {string} Timestamp with hours, minutes and seconds padded to at
 *     least two digits and milliseconds padded to three.
 */
function makeTimeline (time) {
    var total_ms = Math.round(time * 1000);
    var hours = Math.floor(total_ms / 3600000);
    var minutes = Math.floor(total_ms % 3600000 / 60000);
    var seconds = Math.floor(total_ms % 60000 / 1000);
    var milliseconds = total_ms % 1000;

    // Left-pad with zeros up to `width` characters; longer values are kept as-is.
    function pad (value, width) {
        var text = '' + value;
        while (text.length < width) {
            text = '0' + text;
        }
        return text;
    }

    return pad(hours, 2) + ':' + pad(minutes, 2) + ':' + pad(seconds, 2) + ',' + pad(milliseconds, 3);
}

/**
 * Build SRT subtitle text from the <text> elements found under `data`.
 *
 * Each <text> element is read for its `start` attribute (seconds), its
 * optional `dur` attribute (seconds) and its text content. Cues are numbered
 * consecutively starting at 1, independent of the element index.
 *
 * End time of a cue:
 *   - `start + dur` when `dur` is a positive number;
 *   - otherwise the `start` of the following caption;
 *   - otherwise (last caption, no usable `dur`) the cue's own start, so the
 *     end is never earlier than the start.
 *
 * @param {Element|Document} data - Node exposing getElementsByTagName('text').
 * @returns {string} The SRT document; an empty string when there are no captions.
 */
function returnSRT (data) {
    var captions = data.getElementsByTagName('text');
    var total = captions.length;
    var srt_output = '';
    var sequence = 0;
    // Caption whose end time is not known yet (no usable `dur`); it is closed
    // by the start time of the caption that follows it.
    var pending = null;
    var caption, start, duration;

    // '<' and '>' are written as entities, as the original script did.
    function escapeText (text) {
        return text.replace(/</g, '&lt;').replace(/>/g, '&gt;');
    }

    function appendCue (node, from, to) {
        sequence += 1;
        srt_output += sequence + '\n' + makeTimeline(from) + ' --> ' + makeTimeline(to) + '\n' + escapeText(node.textContent) + '\n\n';
    }

    for (var i = 0; i < total; i++) {
        caption = captions[i];
        start = +caption.getAttribute('start');

        if (pending !== null) {
            appendCue(pending.caption, pending.start, start);
            pending = null;
        }

        // A missing attribute yields +null === 0; a non-numeric one yields NaN.
        duration = +caption.getAttribute('dur');

        if (duration > 0) {
            appendCue(caption, start, start + duration);
        } else if (i + 1 < total) {
            pending = { caption: caption, start: start };
        } else {
            appendCue(caption, start, start);
        }
    }
    return srt_output;
}

// Convert the <text> elements under the current page's root element to SRT;
// the value of this expression is the resulting SRT string.
returnSRT(document.documentElement)

Here is also a bookmarklet version of the script:

javascript:(function(){function%20makeTimeline(time)%7Bvar%20string%2Ctime_array%3D%5B%5D%2Cmilliseconds%3DMath.round(time%251*1000).toString()%3Bwhile(3%3Emilliseconds.length)%7Bmilliseconds%3D'0'%2Bmilliseconds%3B%7Dtime_array.push(Math.floor(time%2F(60*60)))%3Btime_array.push(Math.floor((time-(time_array%5B0%5D*60*60))%2F60))%3Btime_array.push(Math.floor(time-((time_array%5B1%5D*60)%2B(time_array%5B0%5D*60*60))))%3Bfor(var%20i%3D0%2Cil%3Dtime_array.length%3Bi%3Cil%3Bi%2B%2B)%7Bstring%3D''%2Btime_array%5Bi%5D%3Bif(1%3D%3D%3Dstring.length)%7Btime_array%5Bi%5D%3D'0'%2Bstring%3B%7D%7Dreturn%20time_array.join('%3A')%2B'%2C'%2Bmilliseconds%3B%7D%3Bfunction%20returnSRT(data)%7Bvar%20caption%2Cprevious_start%2Cstart%2Cend%2Ctemp%2Ccaptions%3Ddata.getElementsByTagName('text')%2Csrt_output%3D''%3Bfor(var%20i%3D0%2Cil%3Dcaptions.length%3Bi%3Cil%3Bi%2B%2B)%7Bcaption%3Dcaptions%5Bi%5D%3Bstart%3D%2Bcaption.getAttribute('start')%3Bif(0%3C%3Dprevious_start)%7Btemp%3Dcaptions%5Bi-1%5D.textContent.replace(%2F%3C%2Fg%2C'%26lt%3B').replace(%2F%3E%2Fg%2C'%26gt%3B')%3Bsrt_output%2B%3Di%2B'%5Cn'%2BmakeTimeline(previous_start)%2B'%20--%3E%20'%2BmakeTimeline(start)%2B'%5Cn'%2Btemp%2B'%5Cn%5Cn'%3Bprevious_start%3D-1%3B%7Dif(end%3D%2Bcaption.getAttribute('dur'))end%3Dstart%2Bend%3Belse%7Bif(captions%5Bi%2B1%5D)%7Bprevious_start%3Dstart%3Bcontinue%3B%7D%7Dtemp%3Dcaption.textContent.replace(%2F%3C%2Fg%2C'%26lt%3B').replace(%2F%3E%2Fg%2C'%26gt%3B')%3Bsrt_output%2B%3Di%2B'%5Cn'%2BmakeTimeline(start)%2B'%20--%3E%20'%2BmakeTimeline(end)%2B'%5Cn'%2Btemp%2B'%5Cn%5Cn'%3B%7D%3Breturn%20srt_output%3B%7Dwindow.location.href%3D'data%3Atext%2Fplain%3Bbase64%2C'%2Bbtoa(returnSRT(document.documentElement))})();