-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
111 lines (96 loc) · 3.96 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import datetime
import json
import os
import requests
import scrapetube
from timelength import TimeLength
from youtube_transcript_api import YouTubeTranscriptApi, _errors
def _iter_channel_playlists(channel_id, api_key):
    """Yield every playlist resource of a channel, following API pagination.

    Args:
        channel_id: YouTube channel id whose playlists are listed.
        api_key: YouTube Data API v3 key.

    Yields:
        The playlist dicts found under "items" in each response page.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    page_token = None
    while True:
        params = {
            "part": "snippet",
            "channelId": channel_id,
            "key": api_key,
            "maxResults": 50,
        }
        if page_token:
            params["pageToken"] = page_token
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/playlists",
            params=params,
            # Without a timeout a stalled connection would hang the run.
            timeout=30,
        )
        # Surface quota/auth failures here instead of as a KeyError later.
        response.raise_for_status()
        page = response.json()
        yield from page.get("items", [])
        page_token = page.get("nextPageToken")
        if not page_token:
            return


def handler():
    """Process every video of the configured channel.

    Videos are visited playlist by playlist first, so that each one is
    tagged with its playlist id and title, and then once more through the
    channel listing with both playlist fields set to None.

    Raises:
        KeyError: if the YOUTUBE_V3_API_KEY environment variable is unset.
        requests.HTTPError: if the playlist listing request fails.
    """
    api_key = os.environ["YOUTUBE_V3_API_KEY"]
    channel_id = "UCxEgOKuI-n-WOJaNcisHvSg"

    # Process videos in playlists
    for playlist in _iter_channel_playlists(channel_id, api_key):
        playlist_id = playlist["id"]
        playlist_title = playlist["snippet"]["title"]
        for video in scrapetube.get_playlist(playlist_id):
            video["playlist_id"] = playlist_id
            video["playlist_title"] = playlist_title
            _process_video(video)

    # Process videos in the channel not part of playlists. Videos already
    # handled above are skipped by _process_video's on-disk check.
    for video in scrapetube.get_channel(channel_id):
        video["playlist_id"] = None
        video["playlist_title"] = None
        _process_video(video)
def _write_json(path, payload):
    """Write payload as indented JSON to path, creating its directory."""
    parent = os.path.dirname(path)
    if parent:
        # The output folders are not guaranteed to exist on a fresh checkout.
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as _file:
        json.dump(payload, _file, indent=4)


def _join_transcript_text(parts):
    """Return the text of all transcript parts as one single-spaced string."""
    joined = " ".join(part["text"] for part in parts)
    # Split on the literal space only: runs of spaces collapse to one while
    # other whitespace inside a part (e.g. newlines) is left untouched.
    return " ".join(piece for piece in joined.split(" ") if piece).strip()


def _process_video(video_metadata):
    """Fetch the Spanish transcript of one video and store it as JSON.

    Args:
        video_metadata: dict describing the video. It must contain
            "videoId", "thumbnail", "title", "playlist_id" and
            "playlist_title", plus either "videoInfo" together with
            "lengthSeconds", or "publishedTimeText" together with
            "lengthText".

    Returns:
        The dict written to data/<video_id>.json, or None when the video
        was already handled (a data/ or failed/ file exists) or when no
        transcript could be retrieved.
    """
    video_id = video_metadata["videoId"]

    # Check if video has already been processed (successfully or not)
    processed_local_path = f"data/{video_id}.json"
    failed_path = f"failed/{video_id}.json"
    if os.path.isfile(processed_local_path) or os.path.isfile(failed_path):
        return None

    # Retrieve transcriptions; failures are recorded so the video is not
    # retried on the next run.
    try:
        transcription_with_timestamps = YouTubeTranscriptApi.get_transcript(
            video_id, languages=["es"]
        )
    except _errors.TranscriptsDisabled:
        print(f"Transcripts are disabled for video {video_id}")
        _write_json(failed_path, video_metadata)
        return None
    except _errors.NoTranscriptFound:
        # Language for some videos is not Spanish - ES
        # Example: https://www.youtube.com/watch?v=k_rBgKb1y8U
        print(f"No transcript available for video {video_id}")
        _write_json(failed_path, video_metadata)
        return None

    transcription_text = _join_transcript_text(transcription_with_timestamps)

    # The metadata comes in two shapes: entries with "videoInfo" carry the
    # length in seconds directly, the others only as a spoken-style label.
    if video_metadata.get("videoInfo"):
        published_time_text = video_metadata["videoInfo"]["runs"][-1]["text"]
        video_length_seconds = int(video_metadata["lengthSeconds"])
    else:
        published_time_text = video_metadata["publishedTimeText"]["simpleText"]
        video_length = video_metadata["lengthText"]["accessibility"][
            "accessibilityData"
        ]["label"]
        video_length_seconds = int(TimeLength(video_length).total_seconds)

    # utcnow() is deprecated; dropping tzinfo keeps the stored string in the
    # same naive "YYYY-MM-DD HH:MM:SS.ffffff" format as before.
    retrieved_time = datetime.datetime.now(datetime.timezone.utc).replace(
        tzinfo=None
    )

    video = {
        "video_id": video_id,
        "video_thumbnail_url": video_metadata["thumbnail"]["thumbnails"][-1]["url"],
        "video_url": f"https://www.youtube.com/watch?v={video_id}",
        "video_title": video_metadata["title"]["runs"][-1]["text"],
        "video_length_seconds": video_length_seconds,
        "transcription_with_timestamps": transcription_with_timestamps,
        "transcription_text": transcription_text,
        # NOTE(review): the label assumes auto-generated captions; the
        # transcript API may return a manually created one - confirm.
        "transcription_source": "YouTube auto-generated captions",
        "playlist_id": video_metadata["playlist_id"],
        "playlist_title": video_metadata["playlist_title"],
        "published_time_text": published_time_text,
        "retrieved_time": str(retrieved_time),
    }
    _write_json(processed_local_path, video)
    return video
# Script entry point: run the full channel crawl only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    handler()