blob: 5527e62b69f10bd0ef8dd08133740df3de200b17 [file] [log] [blame]
// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:convert';
import 'package:html/dom.dart';
import 'package:html/parser.dart' as html_parser;
import 'closed_caption_file.dart';
/// A [ClosedCaptionFile] whose captions come from WebVTT-formatted text.
///
/// See: https://en.wikipedia.org/wiki/WebVTT
class WebVTTCaptionFile extends ClosedCaptionFile {
  /// Creates a caption file by parsing [fileContents] as WebVTT.
  ///
  /// Parsing happens once, at construction time; the resulting list is what
  /// [captions] returns afterwards.
  ///
  /// * See: https://en.wikipedia.org/wiki/WebVTT
  WebVTTCaptionFile(String fileContents)
      : captions = _parseCaptionsFromWebVTTString(fileContents);

  /// The captions produced from the WebVTT text given to the constructor.
  @override
  final List<Caption> captions;
}
// Parses the contents of a WebVTT file into a list of captions.
//
// `file` is split into blank-line-separated blocks by `_readWebVTTFile`.
// A block is skipped when:
//  * it has fewer than two lines (a lone header or timing line, no text),
//  * its first space-separated token is `HEADER`, `NOTE`, `REGION` or
//    `WEBVTT`, or
//  * no valid timing line is found where one is expected.
//
// A cue may start with an optional identifier line. Whether one is present is
// decided by checking if the first line parses as a timing line, not by
// counting lines, so cues with several lines of text and no identifier are
// kept rather than dropped.
//
// Captions are numbered sequentially from 1. When an identifier line is an
// integer, numbering continues from that value.
List<Caption> _parseCaptionsFromWebVTTString(String file) {
  final List<Caption> captions = <Caption>[];

  // Block keywords that never introduce a caption.
  const Set<String> metadata = <String>{'HEADER', 'NOTE', 'REGION', 'WEBVTT'};

  int captionNumber = 1;
  for (final List<String> captionLines in _readWebVTTFile(file)) {
    // `captionLines` holds the lines of one complete block, e.g.
    //   [
    //     '00:00.000 --> 01:24.000 align:center',
    //     'Introduction',
    //   ]
    // A block with just a header or a timing line, but no text, has length 1.
    if (captionLines.length < 2) {
      continue;
    }

    // Ignore blocks whose first token marks them as metadata.
    final String metadataType = captionLines[0].split(' ')[0];
    if (metadata.contains(metadataType)) {
      continue;
    }

    // If the first line is itself a timing line, the cue has no identifier
    // and every following line is caption text.
    _CaptionRange? captionRange =
        _CaptionRange.fromWebVTTString(captionLines[0]);
    int textStartIndex = 1;

    if (captionRange == null) {
      // The first line is an identifier, so a usable cue needs at least
      // identifier + timing + one line of text.
      if (captionLines.length < 3) {
        continue;
      }
      final int? parsedCaptionNumber = int.tryParse(captionLines[0]);
      if (parsedCaptionNumber != null) {
        captionNumber = parsedCaptionNumber;
      }
      captionRange = _CaptionRange.fromWebVTTString(captionLines[1]);
      textStartIndex = 2;
    }

    if (captionRange == null) {
      continue;
    }

    final String text = captionLines.sublist(textStartIndex).join('\n');

    // TODO(cyanglaz): Handle special syntax in VTT captions.
    // https://github.com/flutter/flutter/issues/90007.
    final String textWithoutFormat = _extractTextFromHtml(text);

    captions.add(
      Caption(
        number: captionNumber,
        start: captionRange.start,
        end: captionRange.end,
        text: textWithoutFormat,
      ),
    );
    captionNumber++;
  }

  return captions;
}
// The start and end times of a single caption.
class _CaptionRange {
  _CaptionRange(this.start, this.end);

  final Duration start;
  final Duration end;

  // Builds a range from the timing line of a VTT cue, for example:
  //
  //   00:09.000 --> 00:11.000
  //
  // Returns `null` when `line` does not contain two timestamps joined by the
  // arrow separator, or when either timestamp fails to parse.
  static _CaptionRange? fromWebVTTString(String line) {
    final RegExp timingPattern =
        RegExp(_webVTTTimeStamp + _webVTTArrow + _webVTTTimeStamp);

    if (timingPattern.hasMatch(line)) {
      final List<String> parts = line.split(_webVTTArrow);
      // Both sides are parsed before either result is inspected.
      final Duration? from = _parseWebVTTTimestamp(parts[0]);
      final Duration? to = _parseWebVTTTimestamp(parts[1]);
      if (from != null && to != null) {
        return _CaptionRange(from, to);
      }
    }
    return null;
  }
}
// Returns the plain text of `htmlString` with markup removed.
//
// The input goes through the HTML parser twice: the text of the first
// document's body is parsed again, and the text of that second document's
// root element is the result. An empty string is returned when the body or
// the root element is missing.
String _extractTextFromHtml(String htmlString) {
  final String? firstPassText = html_parser.parse(htmlString).body?.text;
  if (firstPassText == null) {
    return '';
  }
  final String? secondPassText =
      html_parser.parse(firstPassText).documentElement?.text;
  return secondPassText ?? '';
}
// Parses a time stamp in a VTT file into a Duration.
//
// `timestampString` must start (after optional whitespace) with a timestamp
// of the form `mm:ss.ttt` or `hh:mm:ss.ttt`. Anything after the timestamp is
// ignored as long as it is separated from it by whitespace (space or tab), so
// trailing cue settings such as `align:center` are tolerated.
//
// Returns `null`, and never throws, if `timestampString` is in an invalid
// format:
//  * the string does not begin with a timestamp,
//  * the hours component has fewer than two digits,
//  * minutes or seconds are greater than 59,
//  * the fraction is not exactly three digits, or
//  * a component is too large to be represented as an `int`.
//
// See https://www.w3.org/TR/webvtt1/#webvtt-timestamp for the valid WebVTT
// timestamp format.
//
// For example:
//
// _parseWebVTTTimestamp('00:01:08.430')
// returns
// Duration(hours: 0, minutes: 1, seconds: 8, milliseconds: 430)
Duration? _parseWebVTTTimestamp(String timestampString) {
  // Anchor the shared pattern at the start of the string and require the
  // three-digit fraction to be followed by whitespace or the end of input.
  // Reading the capture groups (instead of splitting on '.' and ':' and
  // calling `int.parse`) keeps stray text or tab-separated cue settings from
  // raising a FormatException.
  final RegExpMatch? match =
      RegExp(r'^\s*' + _webVTTTimeStamp + r'(?=\s|$)')
          .firstMatch(timestampString);
  if (match == null) {
    return null;
  }

  final String? leading = match.group(1);
  final String? middle = match.group(2);
  // Present only for `hh:mm:ss.ttt`; includes the leading ':'.
  final String? trailing = match.group(3);
  final String? fraction = match.group(4);
  if (leading == null || middle == null || fraction == null) {
    return null;
  }

  int hours = 0;
  final String minuteString;
  final String secondString;
  if (trailing != null) {
    // Three components: hours, minutes, seconds.
    if (leading.length < 2) {
      return null;
    }
    final int? parsedHours = int.tryParse(leading);
    if (parsedHours == null) {
      return null;
    }
    hours = parsedHours;
    minuteString = middle;
    secondString = trailing.substring(1);
  } else {
    // Two components: minutes, seconds.
    minuteString = leading;
    secondString = middle;
  }

  final int? minutes = int.tryParse(minuteString);
  if (minutes == null || minutes > 59) {
    return null;
  }

  final int? seconds = int.tryParse(secondString);
  if (seconds == null || seconds > 59) {
    return null;
  }

  final int? milliseconds = int.tryParse(fraction);
  if (milliseconds == null) {
    return null;
  }

  // TODO(cyanglaz): Handle caption styles.
  // https://github.com/flutter/flutter/issues/90009.
  // Cue settings that follow the timestamp are currently ignored. For a
  // better readable code style, style parsing should happen before calling
  // this method. See:
  // https://github.com/flutter/plugins/pull/2878/files#r713381134.
  return Duration(
    hours: hours,
    minutes: minutes,
    seconds: seconds,
    milliseconds: milliseconds,
  );
}
// Splits the text of a VTT file into blocks of consecutive non-blank lines.
//
// Every blank (empty or whitespace-only) line closes the current block, and
// the final line of the file closes it as well. Consecutive blank lines
// therefore produce empty blocks in the result.
List<List<String>> _readWebVTTFile(String file) {
  final List<String> lines = LineSplitter.split(file).toList();
  final List<List<String>> blocks = <List<String>>[];
  List<String> pending = <String>[];

  for (int index = 0; index < lines.length; index++) {
    final String line = lines[index];
    final bool isLastLine = index == lines.length - 1;

    if (line.trim().isNotEmpty) {
      pending.add(line);
      if (!isLastLine) {
        // More lines may still belong to this block.
        continue;
      }
    }

    // Reached a blank line or the end of the file: close the block.
    blocks.add(pending);
    pending = <String>[];
  }

  return blocks;
}
// Regular-expression source for one WebVTT timestamp: one or more digits,
// ':', two digits, an optional ':' plus two digits, '.', and three digits.
// The pattern has no anchors, so on its own it matches anywhere in a string.
const String _webVTTTimeStamp = r'(\d+):(\d{2})(:\d{2})?\.(\d{3})';
// Separator between the start and end timestamps of a timing line. It is used
// both as a regular-expression fragment and as a literal delimiter for
// `String.split`, so it must stay free of regex metacharacters.
const String _webVTTArrow = r' --> ';