8755927956

Committed 19 Apr 2024 03:21PM UTC coverage: 70.223%. First build

Build # 8755927956

Build Type

Pull #1085

github

Committed by

web-flow

Commit Message

[DONE] Update settings.py (#1115)

bump pod version to 3.6.0

Pull Request #1085: [DONE - FREEZE] Develop #3.6.0

Run Details

744 of 994 new or added lines in 37 files covered. (74.85%)

10530 of 14995 relevant lines covered (70.22%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

30.0

/pod/video_encode_transcript/transcript.py

"""Esup-Pod transcript video functions."""

from django.conf import settings
from django.core.files import File
from pod.completion.models import Track
from pod.main.tasks import task_start_transcript
from webvtt import Caption

from .utils import (
    send_email,
    send_email_transcript,
    change_encoding_step,
    add_encoding_log,
)
from ..video.models import Video
import importlib.util

if (
    importlib.util.find_spec("vosk") is not None
    or importlib.util.find_spec("stt") is not None
    or importlib.util.find_spec("whisper") is not None
):
    from .transcript_model import start_transcripting


from .encoding_utils import sec_to_timestamp

import os
import time

from tempfile import NamedTemporaryFile

import threading
import logging

# Module configuration, read once at import time from Django settings.
DEBUG = getattr(settings, "DEBUG", False)

# Pick the file model implementation: the podfile app (which also provides
# per-user folders) when USE_PODFILE is enabled, the main app otherwise.
# UserFolder is only imported in the podfile case.
if getattr(settings, "USE_PODFILE", False):
    __FILEPICKER__ = True
    from pod.podfile.models import CustomFileModel
    from pod.podfile.models import UserFolder
else:
    __FILEPICKER__ = False
    from pod.main.models import CustomFileModel

# When True (the default), save_vtt_and_notify() sends a completion email.
EMAIL_ON_TRANSCRIPTING_COMPLETION = getattr(
    settings, "EMAIL_ON_TRANSCRIPTING_COMPLETION", True
)
# Defaults to False when unset; main_threaded_transcript() expects a mapping
# keyed by transcription type, then by language.
TRANSCRIPTION_MODEL_PARAM = getattr(settings, "TRANSCRIPTION_MODEL_PARAM", False)
USE_TRANSCRIPTION = getattr(settings, "USE_TRANSCRIPTION", False)
# NOTE(review): TRANSCRIPTION_TYPE is only defined when USE_TRANSCRIPTION is
# truthy, yet main_threaded_transcript() reads it unconditionally; calling it
# with transcription disabled raises NameError.
if USE_TRANSCRIPTION:
    TRANSCRIPTION_TYPE = getattr(settings, "TRANSCRIPTION_TYPE", "STT")
TRANSCRIPTION_NORMALIZE = getattr(settings, "TRANSCRIPTION_NORMALIZE", False)
# When True, start_transcript() dispatches the job to a Celery task instead of
# a local thread.
CELERY_TO_ENCODE = getattr(settings, "CELERY_TO_ENCODE", False)

# When True, main_threaded_transcript() delegates the transcription to
# start_transcripting_task, which is only imported in that case.
USE_REMOTE_ENCODING_TRANSCODING = getattr(
    settings, "USE_REMOTE_ENCODING_TRANSCODING", False
)
if USE_REMOTE_ENCODING_TRANSCODING:
    from .transcripting_tasks import start_transcripting_task

log = logging.getLogger(__name__)

"""
TO TEST IN THE SHELL -->
from pod.video.transcript import *
stt_model = get_model("fr")
msg, webvtt, all_text = main_stt_transcript(
    "/test/audio_192k_pod.mp3", # file
    177, # file duration
    stt_model # model stt loaded
)
print(webvtt)
"""


# ##########################################################################
# TRANSCRIPT VIDEO: THREAD TO LAUNCH TRANSCRIPT
# ##########################################################################
def start_transcript(video_id, threaded=True):
    """
    Call to start transcript main function.

    Will launch transcript mode depending on configuration.

    Args:
        video_id: id of the video to transcript.
        threaded (bool): when True (default), dispatch the job to the Celery
            task if CELERY_TO_ENCODE is set, otherwise to a background daemon
            thread; when False, run the transcription synchronously.
    """
    if not threaded:
        main_threaded_transcript(video_id)
        return
    if CELERY_TO_ENCODE:
        task_start_transcript.delay(video_id)
        return
    log.info("START TRANSCRIPT VIDEO %s", video_id)
    # Thread.setDaemon() is deprecated; the daemon flag is passed to the
    # constructor instead.
    worker = threading.Thread(
        target=main_threaded_transcript, args=[video_id], daemon=True
    )
    worker.start()


def main_threaded_transcript(video_to_encode_id):
    """
    Transcript main function.

    Will check all configuration and file and launch transcript.

    The video is flagged as "encoding in progress", then one of these happens:

    - no model is configured for the video language: the encoding step is
      set to -1 and an error email is sent;
    - the video has no mp3 encoding: same error handling;
    - remote encoding is enabled: the job is handed to the remote task;
    - otherwise the transcription runs here and the resulting subtitles are
      saved through save_vtt_and_notify().

    Args:
        video_to_encode_id: id of the Video to transcript.
    """
    change_encoding_step(video_to_encode_id, 5, "transcripting audio")

    video_to_encode = Video.objects.get(id=video_to_encode_id)
    video_to_encode.encoding_in_progress = True
    video_to_encode.save()
    msg = ""
    lang = video_to_encode.transcript
    # TRANSCRIPTION_MODEL_PARAM defaults to False and may lack the configured
    # transcription type: treat both as "no model" so the failure is reported
    # through the normal error path instead of crashing the worker.
    model_params = TRANSCRIPTION_MODEL_PARAM or {}
    lang_models = model_params.get(TRANSCRIPTION_TYPE) or {}
    if not lang_models.get(lang):
        msg += "\n no stt model found for lang: %s." % lang
        msg += "Please add it in TRANSCRIPTION_MODEL_PARAM."
        change_encoding_step(video_to_encode.id, -1, msg)
        send_email(msg, video_to_encode.id)
    else:
        # Look the mp3 encoding up once rather than once for the test and
        # once more for the attribute access.
        video_mp3 = video_to_encode.get_video_mp3()
        mp3file = video_mp3.source_file if video_mp3 else None
        if mp3file is None:
            msg += "\n no mp3 file found for video: %s." % video_to_encode.id
            change_encoding_step(video_to_encode.id, -1, msg)
            send_email(msg, video_to_encode.id)
        else:
            mp3filepath = mp3file.path
            if USE_REMOTE_ENCODING_TRANSCODING:
                start_transcripting_task.delay(
                    video_to_encode.id, mp3filepath, video_to_encode.duration, lang
                )
            else:
                msg, webvtt = start_transcripting(
                    mp3filepath, video_to_encode.duration, lang
                )
                save_vtt_and_notify(video_to_encode, msg, webvtt)
    # NOTE(review): on the local success path save_vtt_and_notify() has
    # already passed this message to add_encoding_log(), so it is logged a
    # second time here -- confirm whether that is intended.
    add_encoding_log(video_to_encode.id, msg)


def save_vtt_and_notify(video_to_encode, msg, webvtt):
    """
    Store the subtitles, mark the transcription as finished and notify.

    Args:
        video_to_encode: the video the subtitles belong to.
        msg (str): log text produced so far; the saveVTT() report is
            appended to it before it is written to the encoding log.
        webvtt: the subtitles to store, passed through to saveVTT().
    """
    full_log = msg + saveVTT(video_to_encode, webvtt)
    change_encoding_step(video_to_encode.id, 0, "done")
    video_to_encode.encoding_in_progress = False
    video_to_encode.save()
    # Send the end-of-transcription email when enabled in settings.
    if EMAIL_ON_TRANSCRIPTING_COMPLETION:
        send_email_transcript(video_to_encode)
    add_encoding_log(video_to_encode.id, full_log)


def saveVTT(video, webvtt):
    """
    Save webvtt file with the video.

    The captions are written to a temporary ``.vtt`` file, reformatted by
    improveCaptionsAccessibility(), stored in a CustomFileModel and finally
    attached to the video through a Track for the video's transcript
    language.

    Args:
        video: the video owning the subtitles; its ``transcript`` attribute
            is used as the track language.
        webvtt: the subtitles object; must provide ``save()`` and
            ``captions``.

    Returns:
        str: a log message describing what was done, or ending with an
        error line when there is no caption to store.
    """
    msg = "\nSAVE TRANSCRIPT WEBVTT: %s" % time.ctime()
    lang = video.transcript
    # The context manager closes (and thereby removes) the temporary file on
    # every exit path instead of leaving that to garbage collection.
    with NamedTemporaryFile(suffix=".vtt") as temp_vtt_file:
        # Keep this call before improveCaptionsAccessibility(): that function
        # re-saves with webvtt.save() and no path -- presumably to the path
        # given here (webvtt-py behaviour, confirm).
        webvtt.save(temp_vtt_file.name)
        if not webvtt.captions:
            return msg + "\nERROR SUBTITLES Output size is 0"

        improveCaptionsAccessibility(webvtt)
        msg += "\nstore vtt file in bdd with CustomFileModel model file field"
        # Single timestamp so the model name and the stored file name agree.
        stamp = time.strftime("%Y%m%d-%H%M%S")
        if __FILEPICKER__:
            videodir, _ = UserFolder.objects.get_or_create(
                name="%s" % video.slug, owner=video.owner
            )
            subtitleFile, _ = CustomFileModel.objects.get_or_create(
                name="subtitle_%s_%s" % (lang, stamp),
                folder=videodir,
                created_by=video.owner,
            )
            if subtitleFile.file and os.path.isfile(subtitleFile.file.path):
                os.remove(subtitleFile.file.path)
        else:
            # get_or_create() without lookup arguments behaves like get() on
            # the whole table: it reuses the single existing row (clobbering
            # an unrelated file) or raises MultipleObjectsReturned as soon as
            # two rows exist. Always create a dedicated row instead.
            subtitleFile = CustomFileModel.objects.create()

        subtitleFile.file.save(
            "subtitle_%s_%s.vtt" % (lang, stamp),
            File(temp_vtt_file),
        )
        msg += "\nstore vtt file in bdd with Track model src field"

        subtitleVtt, _ = Track.objects.get_or_create(video=video, lang=lang)
        subtitleVtt.src = subtitleFile
        subtitleVtt.lang = lang
        subtitleVtt.save()
    return msg


def improveCaptionsAccessibility(webvtt):
    """
    Parse the vtt file in argument to render the caption conform to accessibility.

    - see `https://github.com/knarf18/Bonnes-pratiques-du-sous-titrage/blob/master/Liste%20de%20bonnes%20pratiques.md` # noqa: E501
    - 40 characters maximum per line (CPL)
    - 2 lines max by caption

    Captions that need more than two lines are split into several captions
    of at most two lines each; the original display time is shared between
    them in proportion to their word count. The captions list is replaced
    in place and the file is saved again.

    Args:
        webvtt (:class:`webvtt.WebVTT`): the webvtt file content

    """
    new_captions = []
    for caption in webvtt.captions:
        sent = split_string(caption.text, 40, sep=" ")
        if len(sent) <= 2:
            # Already short enough: keep the timing, only re-wrap the text.
            new_cap = Caption()
            new_cap.start = caption.start
            new_cap.end = caption.end
            new_cap.text = "\n".join(sent)
            new_captions.append(new_cap)
            continue

        # Total number of words of the original caption.
        nb_tot_words = len(caption.text.split())
        # Two lines per new caption, rounded up.
        num_captions = (len(sent) + 1) // 2
        dur = caption.end_in_seconds - caption.start_in_seconds
        # Start from the beginning of the original caption, in seconds.
        start_time = caption.start_in_seconds
        for x in range(num_captions):
            new_cap = Caption()
            new_cap.text = get_cap_text(sent, x)
            # Display time proportional to the number of words.
            time_calc = dur * (len(new_cap.text.split()) / nb_tot_words)
            new_cap.start = sec_to_timestamp(start_time)
            new_cap.end = sec_to_timestamp(start_time + time_calc)
            start_time += time_calc
            new_captions.append(new_cap)
    # Replace the old captions in place in one step; popping the head of the
    # list one element at a time is quadratic.
    webvtt.captions[:] = new_captions
    webvtt.save()


def get_cap_text(sent, x):
    """
    Build the text of the x-th two-line caption from a list of lines.

    Lines ``2 * x`` and ``2 * x + 1`` are joined with a newline; when the
    second one does not exist, the first is returned alone.

    Args:
        sent (list): The list of text lines
        x (int): The position of the pair to extract

    Returns:
        str: The extracted text
    """
    first = x * 2
    second = first + 1
    text = sent[first]
    if second < len(sent):
        text = text + "\n" + sent[second]
    return text


def pad(line, limit):
    """
    Right-pad a line with spaces up to the given width.

    A line already at least ``limit`` characters long is returned unchanged.

    Args:
        line (str): A line of text
        limit (int): The width to reach

    Returns:
        str: the line followed by the padding spaces
    """
    return line.ljust(limit)


def split_string(text, limit, sep=" "):
    """
    Wrap text into lines of at most ``limit`` characters, breaking on words.

    Words are taken from ``text.split()`` and greedily joined with ``sep``
    while the line fits in ``limit``. Every returned line is right-padded
    with spaces to exactly ``limit`` characters.

    Args:
        text (str): the text of the caption
        limit (int): size of line
        sep (str): separator inserted between words, default " "

    Returns:
        list: the padded lines; an empty list when the text holds no word

    Raises:
        ValueError: if a single word is longer than ``limit``
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only caption: nothing to wrap. Without this
        # guard max() fails on the empty sequence.
        return []
    if max(map(len, words)) > limit:
        raise ValueError("limit is too small")
    lines = []
    current = words[0]
    for word in words[1:]:
        if len(current) + len(sep) + len(word) > limit:
            lines.append(current)
            current = word
        else:
            current += sep + word
    lines.append(current)
    # Add spaces at the end of each line up to the limit.
    return [line.ljust(limit) for line in lines]

1	"""Esup-Pod transcript video functions."""
2
3	from django.conf import settings	1✔
4	from django.core.files import File	1✔
5	from pod.completion.models import Track	1✔
6	from pod.main.tasks import task_start_transcript	1✔
7	from webvtt import Caption	1✔
8
9	from .utils import (	1✔
10	send_email,
11	send_email_transcript,
12	change_encoding_step,
13	add_encoding_log,
14	)
15	from ..video.models import Video	1✔
16	import importlib.util	1✔
17
18	if (	1✔
19	importlib.util.find_spec("vosk") is not None
20	or importlib.util.find_spec("stt") is not None
21	or importlib.util.find_spec("whisper") is not None
22	):
23	from .transcript_model import start_transcripting	×
24
25
26	from .encoding_utils import sec_to_timestamp	1✔
27
28	import os	1✔
29	import time	1✔
30
31	from tempfile import NamedTemporaryFile	1✔
32
33	import threading	1✔
34	import logging	1✔
35
36	DEBUG = getattr(settings, "DEBUG", False)	1✔
37
38	if getattr(settings, "USE_PODFILE", False):	1✔
39	__FILEPICKER__ = True	1✔
40	from pod.podfile.models import CustomFileModel	1✔
41	from pod.podfile.models import UserFolder	1✔
42	else:
43	__FILEPICKER__ = False	×
44	from pod.main.models import CustomFileModel	×
45
46	EMAIL_ON_TRANSCRIPTING_COMPLETION = getattr(	1✔
47	settings, "EMAIL_ON_TRANSCRIPTING_COMPLETION", True
48	)
49	TRANSCRIPTION_MODEL_PARAM = getattr(settings, "TRANSCRIPTION_MODEL_PARAM", False)	1✔
50	USE_TRANSCRIPTION = getattr(settings, "USE_TRANSCRIPTION", False)	1✔
51	if USE_TRANSCRIPTION:	1✔
52	TRANSCRIPTION_TYPE = getattr(settings, "TRANSCRIPTION_TYPE", "STT")	1✔
53	TRANSCRIPTION_NORMALIZE = getattr(settings, "TRANSCRIPTION_NORMALIZE", False)	1✔
54	CELERY_TO_ENCODE = getattr(settings, "CELERY_TO_ENCODE", False)	1✔
55
56	USE_REMOTE_ENCODING_TRANSCODING = getattr(	1✔
57	settings, "USE_REMOTE_ENCODING_TRANSCODING", False
58	)
59	if USE_REMOTE_ENCODING_TRANSCODING:	1✔
60	from .transcripting_tasks import start_transcripting_task	×
61
62	log = logging.getLogger(__name__)	1✔
63
64	"""
65	TO TEST IN THE SHELL -->
66	from pod.video.transcript import *
67	stt_model = get_model("fr")
68	msg, webvtt, all_text = main_stt_transcript(
69	"/test/audio_192k_pod.mp3", # file
70	177, # file duration
71	stt_model # model stt loaded
72	)
73	print(webvtt)
74	"""
75
76
77	# ##########################################################################
78	# TRANSCRIPT VIDEO: THREAD TO LAUNCH TRANSCRIPT
79	# ##########################################################################
80	def start_transcript(video_id, threaded=True):	1✔
81	"""
82	Call to start transcript main function.
83
84	Will launch transcript mode depending on configuration.
85	"""
86	if threaded:	1✔
87	if CELERY_TO_ENCODE:	1✔
88	task_start_transcript.delay(video_id)	×
89	else:
90	log.info("START TRANSCRIPT VIDEO %s" % video_id)	1✔
91	t = threading.Thread(target=main_threaded_transcript, args=[video_id])	1✔
92	t.setDaemon(True)	1✔
93	t.start()	1✔
94	else:
95	main_threaded_transcript(video_id)	×
96
97
98	def main_threaded_transcript(video_to_encode_id):	1✔
99	"""
100	Transcript main function.
101
102	Will check all configuration and file and launch transcript.
103	"""
104	change_encoding_step(video_to_encode_id, 5, "transcripting audio")	1✔
105
106	video_to_encode = Video.objects.get(id=video_to_encode_id)	×
107	video_to_encode.encoding_in_progress = True	×
108	video_to_encode.save()	×
109	msg = ""	×
110	lang = video_to_encode.transcript	×
111	# check if TRANSCRIPTION_MODEL_PARAM [lang] exist
112	if not TRANSCRIPTION_MODEL_PARAM[TRANSCRIPTION_TYPE].get(lang):	×
113	msg += "\n no stt model found for lang: %s." % lang	×
114	msg += "Please add it in TRANSCRIPTION_MODEL_PARAM."	×
115	change_encoding_step(video_to_encode.id, -1, msg)	×
116	send_email(msg, video_to_encode.id)	×
117	else:
118	mp3file = (	×
119	video_to_encode.get_video_mp3().source_file
120	if video_to_encode.get_video_mp3()
121	else None
122	)
123	if mp3file is None:	×
124	msg += "\n no mp3 file found for video: %s." % video_to_encode.id	×
125	change_encoding_step(video_to_encode.id, -1, msg)	×
126	send_email(msg, video_to_encode.id)	×
127	else:
128	mp3filepath = mp3file.path	×
129	if USE_REMOTE_ENCODING_TRANSCODING:	×
130	start_transcripting_task.delay(	×
131	video_to_encode.id, mp3filepath, video_to_encode.duration, lang
132	)
133	else:
134	msg, webvtt = start_transcripting(	×
135	mp3filepath, video_to_encode.duration, lang
136	)
137	save_vtt_and_notify(video_to_encode, msg, webvtt)	×
138	add_encoding_log(video_to_encode.id, msg)	×
139
140
141	def save_vtt_and_notify(video_to_encode, msg, webvtt):	1✔
142	"""Call save vtt file function and notify by mail at the end."""
143	msg += saveVTT(video_to_encode, webvtt)	×
144	change_encoding_step(video_to_encode.id, 0, "done")	×
145	video_to_encode.encoding_in_progress = False	×
146	video_to_encode.save()	×
147	# envois mail fin transcription
148	if EMAIL_ON_TRANSCRIPTING_COMPLETION:	×
149	send_email_transcript(video_to_encode)	×
150	add_encoding_log(video_to_encode.id, msg)	×
151
152
153	def saveVTT(video, webvtt):	1✔
154	"""Save webvtt file with the video."""
NEW 155	msg = "\nSAVE TRANSCRIPT WEBVTT: %s" % time.ctime()	×
156	lang = video.transcript	×
157	temp_vtt_file = NamedTemporaryFile(suffix=".vtt")	×
158	webvtt.save(temp_vtt_file.name)	×
159	if webvtt.captions:	×
160	improveCaptionsAccessibility(webvtt)	×
161	msg += "\nstore vtt file in bdd with CustomFileModel model file field"	×
162	if __FILEPICKER__:	×
163	videodir, created = UserFolder.objects.get_or_create(	×
164	name="%s" % video.slug, owner=video.owner
165	)
166	"""
167	previousSubtitleFile = CustomFileModel.objects.filter(
168	name__startswith="subtitle_%s" % lang,
169	folder=videodir,
170	created_by=video.owner
171	)
172	"""
173	# for subt in previousSubtitleFile:
174	# subt.delete()
175	subtitleFile, created = CustomFileModel.objects.get_or_create(	×
176	name="subtitle_%s_%s" % (lang, time.strftime("%Y%m%d-%H%M%S")),
177	folder=videodir,
178	created_by=video.owner,
179	)
180	if subtitleFile.file and os.path.isfile(subtitleFile.file.path):	×
181	os.remove(subtitleFile.file.path)	×
182	else:
183	subtitleFile, created = CustomFileModel.objects.get_or_create()	×
184
185	subtitleFile.file.save(	×
186	"subtitle_%s_%s.vtt" % (lang, time.strftime("%Y%m%d-%H%M%S")),
187	File(temp_vtt_file),
188	)
189	msg += "\nstore vtt file in bdd with Track model src field"	×
190
191	subtitleVtt, created = Track.objects.get_or_create(video=video, lang=lang)	×
192	subtitleVtt.src = subtitleFile	×
193	subtitleVtt.lang = lang	×
194	subtitleVtt.save()	×
195	else:
196	msg += "\nERROR SUBTITLES Output size is 0"	×
197	return msg	×
198
199
200	def improveCaptionsAccessibility(webvtt):	1✔
201	"""
202	Parse the vtt file in argument to render the caption conform to accessibility.
203
204	- see `https://github.com/knarf18/Bonnes-pratiques-du-sous-titrage/blob/master/Liste%20de%20bonnes%20pratiques.md` # noqa: E501
205	- 40 car maximum per ligne (CPL)
206	- 2 lines max by caption
207
208	Args:
209	webvtt (:class:`webvtt.WebVTT`): the webvtt file content
210
211	"""
212	new_captions = []	×
213	for caption in webvtt.captions:	×
214	sent = split_string(caption.text, 40, sep=" ")	×
215	# nb mots total
216	nbTotWords = len(caption.text.split())	×
217	if len(sent) > 2:	×
218	num_captions = int(len(sent) / 2)	×
219	if len(sent) % 2:	×
220	num_captions += 1	×
221	dur = caption.end_in_seconds - caption.start_in_seconds	×
222	# On se positionne sur le point de départ en sec
223	startTime = caption.start_in_seconds	×
224	for x in range(num_captions):	×
225	new_cap = Caption()	×
226	new_cap.text = get_cap_text(sent, x)	×
227	# Durée d'affichage au prorata du nombre de mots
228	timeCalc = dur * (len(new_cap.text.split()) / nbTotWords)	×
229	new_cap.start = sec_to_timestamp(startTime)	×
230	new_cap.end = sec_to_timestamp(startTime + timeCalc)	×
231	startTime = startTime + timeCalc	×
232	new_captions.append(new_cap)	×
233	else:
234	new_cap = Caption()	×
235	new_cap.start = caption.start	×
236	new_cap.end = caption.end	×
237	new_cap.text = "\n".join(sent)	×
238	new_captions.append(new_cap)	×
239	# remove all old captions
240	while len(webvtt.captions) > 0:	×
241	del webvtt.captions[0]	×
242	# add the new one
243	for cap in new_captions:	×
244	webvtt.captions.append(cap)	×
245	webvtt.save()	×
246
247
248	def get_cap_text(sent, x):	1✔
249	"""
250	Get the text in the sent array at the position gived in arg.
251
252	Args:
253	sent (list): The list of text
254	x (int): The position to extract
255
256	Returns:
257	str: The extracted text
258	"""
259	new_cap_text = sent[x * 2]	×
260	try:	×
261	new_cap_text += "\n" + sent[x * 2 + 1]	×
262	except IndexError:	×
263	pass	×
264	return new_cap_text	×
265
266
267	def pad(line, limit):	1✔
268	"""
269	Add some space at the end of line to specified limit.
270
271	Args:
272	line (str): A line of text
273	limit (int): The size of line
274
275	Returns:
276	str: the line with space at the end
277	"""
278	return line + " " * (limit - len(line))	×
279
280
281	def split_string(text, limit, sep=" "):	1✔
282	"""
283	Split text by word for specified limit.
284
285	Args:
286	text (str): the text of the caption
287	limit (int): size of line
288	sep (str): default " "
289
290	Returns:
291	array: list of words in the text
292	"""
293	words = text.split()	×
294	if max(map(len, words)) > limit:	×
295	raise ValueError("limit is too small")	×
296	res = []	×
297	part = words[0]	×
298	others = words[1:]	×
299	for word in others:	×
300	if len(sep) + len(word) > limit - len(part):	×
301	res.append(part)	×
302	part = word	×
303	else:
304	part += sep + word	×
305	if part:	×
306	res.append(part)	×
307	# add space to the end of line
308	result = [pad(line, limit) for line in res]	×
309	return result	×

EsupPortail / Esup-Pod / 8755927956

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous