mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2024-11-22 18:22:21 +00:00
[utils] fix dfxp2srt text extraction (fixes #8055)
This commit is contained in:
parent
ed7cd1e859
commit
2b14cb566f
1 changed file with 20 additions and 13 deletions
|
@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
|
||||||
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
|
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
|
||||||
})
|
})
|
||||||
|
|
||||||
|
class TTMLPElementParser:
|
||||||
|
out = ''
|
||||||
|
|
||||||
|
def start(self, tag, attrib):
|
||||||
|
if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
|
||||||
|
self.out += '\n'
|
||||||
|
|
||||||
|
def end(self, tag):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def data(self, data):
|
||||||
|
self.out += data
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
return self.out.strip()
|
||||||
|
|
||||||
def parse_node(node):
|
def parse_node(node):
|
||||||
str_or_empty = functools.partial(str_or_none, default='')
|
target = TTMLPElementParser()
|
||||||
|
parser = xml.etree.ElementTree.XMLParser(target=target)
|
||||||
out = str_or_empty(node.text)
|
parser.feed(xml.etree.ElementTree.tostring(node))
|
||||||
|
return parser.close()
|
||||||
for child in node:
|
|
||||||
if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
|
|
||||||
out += '\n' + str_or_empty(child.tail)
|
|
||||||
elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
|
|
||||||
out += str_or_empty(parse_node(child))
|
|
||||||
else:
|
|
||||||
out += str_or_empty(xml.etree.ElementTree.tostring(child))
|
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
||||||
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
|
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
|
||||||
out = []
|
out = []
|
||||||
|
|
Loading…
Reference in a new issue